Merge branch 'main' into fix/health-check-interval

Commit 322c7cd353 by Florian Greinacher, 2024-10-28 21:27:03 +01:00 (committed by GitHub)
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
136 changed files with 5845 additions and 3096 deletions


@ -119,7 +119,7 @@ jobs:
paths:
- local_testing_coverage.xml
- local_testing_coverage
ui_endpoint_testing:
auth_ui_unit_tests:
docker:
- image: cimg/python:3.11
auth:
@ -161,8 +161,8 @@ jobs:
- run:
name: Rename the coverage files
command: |
mv coverage.xml ui_endpoint_testing_coverage.xml
mv .coverage ui_endpoint_testing_coverage
mv coverage.xml auth_ui_unit_tests_coverage.xml
mv .coverage auth_ui_unit_tests_coverage
# Store test results
- store_test_results:
@ -171,8 +171,8 @@ jobs:
- persist_to_workspace:
root: .
paths:
- ui_endpoint_testing_coverage.xml
- ui_endpoint_testing_coverage
- auth_ui_unit_tests_coverage.xml
- auth_ui_unit_tests_coverage
litellm_router_testing: # Runs all tests with the "router" keyword
docker:
- image: cimg/python:3.11
@ -416,15 +416,17 @@ jobs:
command: |
python -m pip install --upgrade pip
pip install ruff
pip install pylint
pip install pylint
pip install pyright
pip install .
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- run: python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
- run: ruff check ./litellm
- run: python ./tests/documentation_tests/test_general_setting_keys.py
- run: python ./tests/code_coverage_tests/router_code_coverage.py
- run: python ./tests/documentation_tests/test_env_keys.py
- run: helm lint ./deploy/charts/litellm-helm
db_migration_disable_update_check:
machine:
image: ubuntu-2204:2023.10.1
@ -811,7 +813,7 @@ jobs:
python -m venv venv
. venv/bin/activate
pip install coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage ui_endpoint_testing_coverage
coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage
coverage xml
- codecov/upload:
file: ./coverage.xml
@ -1011,7 +1013,7 @@ workflows:
only:
- main
- /litellm_.*/
- ui_endpoint_testing:
- auth_ui_unit_tests:
filters:
branches:
only:
@ -1060,7 +1062,7 @@ workflows:
- litellm_router_testing
- local_testing
- litellm_assistants_api_testing
- ui_endpoint_testing
- auth_ui_unit_tests
- db_migration_disable_update_check:
filters:
branches:
@ -1088,7 +1090,7 @@ workflows:
- logging_testing
- litellm_router_testing
- litellm_assistants_api_testing
- ui_endpoint_testing
- auth_ui_unit_tests
- db_migration_disable_update_check
- e2e_ui_testing
- installing_litellm_on_python
@ -1099,4 +1101,4 @@ workflows:
branches:
only:
- main


@ -50,6 +50,9 @@ jobs:
current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }}
version-fragment: 'bug'
- name: Lint helm chart
run: helm lint deploy/charts/litellm-helm
- uses: ./.github/actions/helm-oci-chart-releaser
with:
name: litellm-helm
@ -61,4 +64,4 @@ jobs:
registry_username: ${{ github.actor }}
registry_password: ${{ secrets.GITHUB_TOKEN }}
update_dependencies: true


@ -18,4 +18,15 @@ component_management:
paths:
- "*/proxy/auth/**"
comment:
layout: "header, diff, flags, components" # show component info in the PR comment
layout: "header, diff, flags, components" # show component info in the PR comment
coverage:
status:
project:
default:
target: auto
threshold: 1% # at maximum allow project coverage to drop by 1%
patch:
default:
target: auto
threshold: 0% # patch coverage should be 100%


@ -1,72 +0,0 @@
import clickhouse_connect
import datetime as datetime
import os
client = clickhouse_connect.get_client(
host=os.getenv("CLICKHOUSE_HOST"),
port=int(os.getenv("CLICKHOUSE_PORT")),
username=os.getenv("CLICKHOUSE_USERNAME"),
password=os.getenv("CLICKHOUSE_PASSWORD"),
)
import clickhouse_connect
row1 = [
"ishaan", # request_id
"GET", # call_type
"api_key_123", # api_key
50.00, # spend
1000, # total_tokens
800, # prompt_tokens
200, # completion_tokens
datetime.datetime.now(), # startTime (replace with the actual timestamp)
datetime.datetime.now(), # endTime (replace with the actual timestamp)
"gpt-3.5", # model
"user123", # user
'{"key": "value"}', # metadata (replace with valid JSON)
"True", # cache_hit
"cache_key_123", # cache_key
"tag1,tag2", # request_tags
]
row2 = [
"jaffer", # request_id
"POST", # call_type
"api_key_456", # api_key
30.50, # spend
800, # total_tokens
600, # prompt_tokens
200, # completion_tokens
datetime.datetime.now(), # startTime (replace with the actual timestamp)
datetime.datetime.now(), # endTime (replace with the actual timestamp)
"gpt-4.0", # model
"user456", # user
'{"key": "value"}', # metadata (replace with valid JSON)
"False", # cache_hit
"cache_key_789", # cache_key
"tag3,tag4", # request_tags
]
data = [row1, row2]
resp = client.insert(
"spend_logs",
data,
column_names=[
"request_id",
"call_type",
"api_key",
"spend",
"total_tokens",
"prompt_tokens",
"completion_tokens",
"startTime",
"endTime",
"model",
"user",
"metadata",
"cache_hit",
"cache_key",
"request_tags",
],
)
print(resp)


@ -1,39 +0,0 @@
# insert data into clickhouse
# response = client.command(
# """
# CREATE TEMPORARY TABLE temp_spend_logs AS (
# SELECT
# generateUUIDv4() AS request_id,
# arrayElement(['TypeA', 'TypeB', 'TypeC'], rand() % 3 + 1) AS call_type,
# 'ishaan' as api_key,
# rand() * 1000 AS spend,
# rand() * 100 AS total_tokens,
# rand() * 50 AS prompt_tokens,
# rand() * 50 AS completion_tokens,
# toDate('2024-02-01') + toIntervalDay(rand()%27) AS startTime,
# now() AS endTime,
# arrayElement(['azure/gpt-4', 'gpt-3.5', 'vertexai/gemini-pro', 'mistral/mistral-small', 'ollama/llama2'], rand() % 3 + 1) AS model,
# 'ishaan-insert-rand' as user,
# 'data' as metadata,
# 'true'AS cache_hit,
# 'ishaan' as cache_key,
# '{"tag1": "value1", "tag2": "value2"}' AS request_tags
# FROM numbers(1, 1000000)
# );
# """
# )
# client.command(
# """
# -- Insert data into spend_logs table
# INSERT INTO spend_logs
# SELECT * FROM temp_spend_logs;
# """
# )
# client.command(
# """
# DROP TABLE IF EXISTS temp_spend_logs;
# """
# )


@ -24,7 +24,7 @@ version: 0.3.0
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: v1.46.6
appVersion: v1.50.2
dependencies:
- name: "postgresql"


@ -28,14 +28,13 @@ If `db.useStackgresOperator` is used (not yet implemented):
| `image.repository` | LiteLLM Proxy image repository | `ghcr.io/berriai/litellm` |
| `image.pullPolicy` | LiteLLM Proxy image pull policy | `IfNotPresent` |
| `image.tag` | Overrides the image tag, whose default is the latest version of LiteLLM at the time this chart was published. | `""` |
| `image.dbReadyImage` | On Pod startup, an initContainer is used to make sure the Postgres database is available before attempting to start LiteLLM. This field specifies the image to use as that initContainer. | `docker.io/bitnami/postgresql` |
| `image.dbReadyTag` | Tag for the above image. If not specified, "latest" is used. | `""` |
| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` |
| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` |
| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` |
| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `4000` |
| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A |
| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A |
| `extraContainers[]` | An array of additional containers to be deployed as sidecars alongside the LiteLLM Proxy. | `[]` |
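For illustration, a minimal sketch of what the new `extraContainers` value might look like in `values.yaml`; the sidecar name, image, and command below are hypothetical placeholders, not part of the chart:

```yaml
# Hypothetical example: run a simple sidecar next to the LiteLLM Proxy container
extraContainers:
  - name: metrics-sidecar            # placeholder name
    image: busybox:1.36              # placeholder image
    command: ["sh", "-c", "sleep infinity"]
```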
#### Example `environmentSecrets` Secret
@ -127,4 +126,4 @@ kubectl -n litellm get secret <RELEASE>-litellm-masterkey -o jsonpath="{.data.ma
At the time of writing, the Admin UI is unable to add models. This is because
it would need to update the `config.yaml` file, which is an exposed ConfigMap and
therefore read-only. This is a limitation of this helm chart, not the Admin UI
itself.
itself.


@ -31,71 +31,6 @@ spec:
serviceAccountName: {{ include "litellm.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: db-ready
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.image.dbReadyImage }}:{{ .Values.image.dbReadyTag | default("16.1.0-debian-11-r20") }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
env:
{{- if .Values.db.deployStandalone }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ include "litellm.fullname" . }}-dbcredentials
key: username
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: {{ include "litellm.fullname" . }}-dbcredentials
key: password
- name: DATABASE_HOST
value: {{ .Release.Name }}-postgresql
- name: DATABASE_NAME
value: litellm
{{- else if .Values.db.useExisting }}
- name: DATABASE_USERNAME
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.usernameKey }}
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: {{ .Values.db.secret.name }}
key: {{ .Values.db.secret.passwordKey }}
- name: DATABASE_HOST
value: {{ .Values.db.endpoint }}
- name: DATABASE_NAME
value: {{ .Values.db.database }}
{{- end }}
command:
- sh
- -c
- |
# Maximum wait time will be (limit * 2) seconds.
limit=60
current=0
ret=1
while [ $current -lt $limit ] && [ $ret -ne 0 ]; do
echo "Waiting for database to be ready $current"
psql -U $(DATABASE_USERNAME) -h $(DATABASE_HOST) -l
ret=$?
current=$(( $current + 1 ))
sleep 2
done
if [ $ret -eq 0 ]; then
echo "Database is ready"
else
echo "Database failed to become ready before we gave up waiting."
fi
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{ if .Values.securityContext.readOnlyRootFilesystem }}
volumeMounts:
- name: tmp
mountPath: /tmp
{{ end }}
containers:
- name: {{ include "litellm.name" . }}
securityContext:
@ -203,6 +138,9 @@ spec:
{{- with .Values.volumeMounts }}
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.extraContainers }}
{{- toYaml . | nindent 8 }}
{{- end }}
volumes:
{{ if .Values.securityContext.readOnlyRootFilesystem }}
- name: tmp
@ -235,4 +173,4 @@ spec:
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}


@ -7,16 +7,11 @@ replicaCount: 1
image:
# Use "ghcr.io/berriai/litellm-database" for optimized image with database
repository: ghcr.io/berriai/litellm-database
pullPolicy: IfNotPresent
pullPolicy: Always
# Overrides the image tag whose default is the chart appVersion.
# tag: "main-latest"
tag: ""
# Image and tag used for the init container to check and wait for the
# readiness of the postgres database.
dbReadyImage: docker.io/bitnami/postgresql
dbReadyTag: ""
imagePullSecrets: []
nameOverride: "litellm"
fullnameOverride: ""


@ -84,6 +84,60 @@ print(query_result[:5])
</TabItem>
</Tabs>
## Image Embeddings
For models that support image embeddings, you can pass in a base64 encoded image string to the `input` param.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import embedding
import os
# set your api key
os.environ["COHERE_API_KEY"] = ""
response = embedding(model="cohere/embed-english-v3.0", input=["<base64 encoded image>"])
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: cohere-embed
litellm_params:
model: cohere/embed-english-v3.0
api_key: os.environ/COHERE_API_KEY
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/v1/embeddings' \
-H 'Authorization: Bearer sk-54d77cd67b9febbb' \
-H 'Content-Type: application/json' \
-d '{
"model": "cohere/embed-english-v3.0",
"input": ["<base64 encoded image>"]
}'
```
</TabItem>
</Tabs>
## Input Params for `litellm.embedding()`


@ -62,7 +62,8 @@ litellm_settings:
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize api endpoint
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT
```
## Support & Talk to Founders


@ -9,12 +9,11 @@ LiteLLM requires `boto3` to be installed on your system for Bedrock requests
pip install boto3>=1.28.57
```
## Required Environment Variables
```python
os.environ["AWS_ACCESS_KEY_ID"] = "" # Access key
os.environ["AWS_SECRET_ACCESS_KEY"] = "" # Secret access key
os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2
```
:::info
LiteLLM uses boto3 to handle authentication. All these options are supported - https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#credentials.
:::
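As a point of reference, the same credentials can also be supplied per model in the proxy `config.yaml` (set up later on this page). A hedged sketch only; the model entry and `aws_*` parameter names are assumptions for illustration, reusing the `os.environ/` secret syntax shown elsewhere in these docs:

```yaml
# Illustrative sketch - model id and parameter names are assumptions
model_list:
  - model_name: bedrock-claude
    litellm_params:
      model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0   # example Bedrock model id
      aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
      aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
      aws_region_name: os.environ/AWS_REGION_NAME
```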
## Usage
@ -22,6 +21,7 @@ os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```python
import os
from litellm import completion
@ -38,7 +38,7 @@ response = completion(
## LiteLLM Proxy Usage
Here's how to call Anthropic with the LiteLLM Proxy Server
Here's how to call Bedrock with the LiteLLM Proxy Server
### 1. Setup config.yaml


@ -135,7 +135,7 @@ Cli arguments, --host, --port, --num_workers
```
## --request_timeout
- **Default:** `600`
- **Default:** `6000`
- **Type:** `int`
- Set the timeout in seconds for completion calls.
- **Usage:**


@ -625,6 +625,7 @@ litellm_settings:
redact_user_api_key_info: boolean # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging.
langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] # default tags for Langfuse Logging
request_timeout: 10 # (int) llm request timeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION
json_logs: boolean # if true, logs will be in json format
@ -721,6 +722,7 @@ general_settings:
| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION |
| json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) |
| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) |
| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference, the OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) |
| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. [Further docs](./reliability#content-policy-fallbacks) |
| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. [Further docs](./reliability#context-window-fallbacks) |
| cache | boolean | If true, enables caching. [Further docs](./caching) |
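To make the `request_timeout` row above concrete, a minimal `litellm_settings` sketch (the value shown is only an example, matching the production-setup docs later in this commit):

```yaml
litellm_settings:
  request_timeout: 600  # raise a Timeout error if an LLM call takes longer than 600 seconds
```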
@ -812,6 +814,7 @@ general_settings:
| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) |
| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). |
| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call |
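And for the new `forward_client_headers_to_llm_api` flag above, a hedged `general_settings` sketch:

```yaml
general_settings:
  forward_client_headers_to_llm_api: true  # forward client `x-*` headers to the backend LLM call
```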
### router_settings - Reference
@ -898,10 +901,6 @@ router_settings:
| BRAINTRUST_API_KEY | API key for Braintrust integration
| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI
| CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI
| CLICKHOUSE_HOST | Host for ClickHouse database
| CLICKHOUSE_PASSWORD | Password for ClickHouse authentication
| CLICKHOUSE_PORT | Port for ClickHouse database connection
| CLICKHOUSE_USERNAME | Username for ClickHouse authentication
| CONFIG_FILE_PATH | File path for configuration file
| CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache
| DATABASE_HOST | Hostname for the database server
@ -919,6 +918,7 @@ router_settings:
| DD_API_KEY | API key for Datadog integration
| DD_SITE | Site URL for Datadog (e.g., datadoghq.com)
| DD_SOURCE | Source identifier for Datadog logs
| DD_ENV | Environment identifier for Datadog logs. Only supported for `datadog_llm_observability` callback
| DEBUG_OTEL | Enable debug mode for OpenTelemetry
| DIRECT_URL | Direct URL for service endpoint
| DISABLE_ADMIN_UI | Toggle to disable the admin UI


@ -57,4 +57,34 @@ model_list:
api_version: os.environ/AZURE_API_VERSION
input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token
output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token
```
```
### Debugging
If your custom pricing is not being used or you're seeing errors, please check the following:
1. Run the proxy with `LITELLM_LOG="DEBUG"` or the `--detailed_debug` cli flag
```bash
litellm --config /path/to/config.yaml --detailed_debug
```
2. Check logs for this line:
```
LiteLLM:DEBUG: utils.py:263 - litellm.acompletion
```
3. Check that 'input_cost_per_token' and 'output_cost_per_token' are passed as top-level keys to the acompletion function:
```python
acompletion(
    ...,
    input_cost_per_token=my_custom_price,
    output_cost_per_token=my_custom_price,
)
```
If these keys are not present, LiteLLM will not use your custom pricing.
If the problem persists, please file an issue on [GitHub](https://github.com/BerriAI/litellm/issues).


@ -1279,7 +1279,8 @@ litellm_settings:
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize api endpoint
ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint
ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT
```
2. Start Proxy
@ -1467,6 +1468,13 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
## Logging Proxy Input/Output - DataDog
LiteLLM supports logging to the following Datadog integrations:
- `datadog` [Datadog Logs](https://docs.datadoghq.com/logs/)
- `datadog_llm_observability` [Datadog LLM Observability](https://www.datadoghq.com/product/llm-observability/)
<Tabs>
<TabItem value="datadog" label="Datadog Logs">
We will use the `--config` to set `litellm.success_callback = ["datadog"]`. This will log all successful LLM calls to DataDog.
**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
@ -1481,6 +1489,21 @@ litellm_settings:
service_callback: ["datadog"] # logs redis, postgres failures on datadog
```
</TabItem>
<TabItem value="datadog_llm_observability" label="Datadog LLM Observability">
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
callbacks: ["datadog_llm_observability"] # logs llm success logs on datadog
```
</TabItem>
</Tabs>
**Step 2**: Set Required env variables for datadog
```shell


@ -21,6 +21,7 @@ general_settings:
database_connection_pool_limit: 10 # limit the number of database connections to = MAX Number of DB Connections/Number of instances of litellm proxy (Around 10-20 is good number)
litellm_settings:
request_timeout: 600 # raise Timeout error if call takes longer than 600 seconds. Default value is 6000 seconds if not set
set_verbose: False # Switch off Debug Logging, ensure your logs do not have any debugging on
json_logs: true # Get debug logs in json format
```


@ -83,4 +83,21 @@ ws.on("message", function incoming(message) {
ws.on("error", function handleError(error) {
console.error("Error: ", error);
});
```
```
## Logging
To prevent requests from being dropped, by default LiteLLM just logs these event types:
- `session.created`
- `response.create`
- `response.done`
You can override this by setting the `logged_real_time_event_types` parameter in the config. For example:
```yaml
litellm_settings:
logged_real_time_event_types: "*" # Log all events
## OR ##
logged_real_time_event_types: ["session.created", "response.create", "response.done"] # Log only these event types
```


@ -1312,7 +1312,7 @@ LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw Open
```
#### --request_timeout
- **Default:** `600`
- **Default:** `6000`
- **Type:** `int`
- Set the timeout in seconds for completion calls.
- **Usage:**


@ -12447,9 +12447,9 @@
}
},
"node_modules/http-proxy-middleware": {
"version": "2.0.6",
"resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.6.tgz",
"integrity": "sha512-ya/UeJ6HVBYxrgYotAZo1KvPWlgB48kUJLDePFeneHsVujFaW5WNj2NgWCAE//B1Dl02BIfYlpNgBy8Kf8Rjmw==",
"version": "2.0.7",
"resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz",
"integrity": "sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==",
"dependencies": {
"@types/http-proxy": "^1.17.8",
"http-proxy": "^1.18.1",


@ -8,6 +8,7 @@ import os
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
from litellm._logging import (
set_verbose,
_turn_on_debug,
@ -48,6 +49,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"langsmith",
"prometheus",
"datadog",
"datadog_llm_observability",
"galileo",
"braintrust",
"arize",
@ -56,6 +58,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"opik",
"argilla",
]
logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
_known_custom_logger_compatible_callbacks: List = list(
get_args(_custom_logger_compatible_callbacks_literal)
)
@ -79,6 +82,9 @@ turn_off_message_logging: Optional[bool] = False
log_raw_request_response: bool = False
redact_messages_in_exceptions: Optional[bool] = False
redact_user_api_key_info: Optional[bool] = False
add_user_information_to_llm_headers: Optional[bool] = (
None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers
)
store_audit_logs = False # Enterprise feature, allow users to see audit logs
## end of callbacks #############
@ -132,7 +138,7 @@ enable_azure_ad_token_refresh: Optional[bool] = False
### DEFAULT AZURE API VERSION ###
AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest
### COHERE EMBEDDINGS DEFAULT TYPE ###
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE = "search_document"
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
openai_moderations_model_name: Optional[str] = None
@ -159,9 +165,6 @@ enable_caching_on_provider_specific_optional_params: bool = (
caching: bool = (
False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
always_read_redis: bool = (
True # always use redis for rate limiting logic on litellm proxy
)
caching_with_models: bool = (
False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)


@ -69,6 +69,8 @@ def _get_redis_cluster_kwargs(client=None):
available_args = [x for x in arg_spec.args if x not in exclude_args]
available_args.append("password")
available_args.append("username")
available_args.append("ssl")
return available_args


@ -233,7 +233,7 @@ class Cache:
if self.namespace is not None and isinstance(self.cache, RedisCache):
self.cache.namespace = self.namespace
def get_cache_key(self, *args, **kwargs) -> str: # noqa: PLR0915
def get_cache_key(self, *args, **kwargs) -> str:
"""
Get the cache key for the given arguments.


@ -32,7 +32,6 @@ class DualCache(BaseCache):
redis_cache: Optional[RedisCache] = None,
default_in_memory_ttl: Optional[float] = None,
default_redis_ttl: Optional[float] = None,
always_read_redis: Optional[bool] = True,
) -> None:
super().__init__()
# If in_memory_cache is not provided, use the default InMemoryCache
@ -44,7 +43,6 @@ class DualCache(BaseCache):
default_in_memory_ttl or litellm.default_in_memory_ttl
)
self.default_redis_ttl = default_redis_ttl or litellm.default_redis_ttl
self.always_read_redis = always_read_redis
def update_cache_ttl(
self, default_in_memory_ttl: Optional[float], default_redis_ttl: Optional[float]
@ -102,12 +100,8 @@ class DualCache(BaseCache):
if in_memory_result is not None:
result = in_memory_result
if (
(self.always_read_redis is True)
and self.redis_cache is not None
and local_only is False
):
# If not found in in-memory cache or always_read_redis is True, try fetching from Redis
if result is None and self.redis_cache is not None and local_only is False:
# If not found in in-memory cache, try fetching from Redis
redis_result = self.redis_cache.get_cache(key, **kwargs)
if redis_result is not None:


@ -1,167 +0,0 @@
#### What this does ####
# On success + failure, log events to aispend.io
import datetime
import os
import traceback
import dotenv
model_cost = {
"gpt-3.5-turbo": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-35-turbo": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
}, # azure model name
"gpt-3.5-turbo-0613": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
},
"gpt-35-turbo-16k": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
}, # azure model name
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
},
"gpt-4": {
"max_tokens": 8000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.00006,
},
"gpt-4-0613": {
"max_tokens": 8000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.00006,
},
"gpt-4-32k": {
"max_tokens": 8000,
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
},
"claude-instant-1": {
"max_tokens": 100000,
"input_cost_per_token": 0.00000163,
"output_cost_per_token": 0.00000551,
},
"claude-2": {
"max_tokens": 100000,
"input_cost_per_token": 0.00001102,
"output_cost_per_token": 0.00003268,
},
"text-bison-001": {
"max_tokens": 8192,
"input_cost_per_token": 0.000004,
"output_cost_per_token": 0.000004,
},
"chat-bison-001": {
"max_tokens": 4096,
"input_cost_per_token": 0.000002,
"output_cost_per_token": 0.000002,
},
"command-nightly": {
"max_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000015,
},
}
class AISpendLogger:
# Class variables or attributes
def __init__(self):
# Instance variables
self.account_id = os.getenv("AISPEND_ACCOUNT_ID")
self.api_key = os.getenv("AISPEND_API_KEY")
def price_calculator(self, model, response_obj, start_time, end_time):
# try and find if the model is in the model_cost map
# else default to the average of the costs
prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = 0
if model in model_cost:
prompt_tokens_cost_usd_dollar = (
model_cost[model]["input_cost_per_token"]
* response_obj["usage"]["prompt_tokens"]
)
completion_tokens_cost_usd_dollar = (
model_cost[model]["output_cost_per_token"]
* response_obj["usage"]["completion_tokens"]
)
elif "replicate" in model:
# replicate models are charged based on time
# llama 2 runs on an nvidia a100 which costs $0.0032 per second - https://replicate.com/replicate/llama-2-70b-chat
model_run_time = end_time - start_time # assuming time in seconds
cost_usd_dollar = model_run_time * 0.0032
prompt_tokens_cost_usd_dollar = cost_usd_dollar / 2
completion_tokens_cost_usd_dollar = cost_usd_dollar / 2
else:
# calculate average input cost
input_cost_sum = 0
output_cost_sum = 0
for model in model_cost:
input_cost_sum += model_cost[model]["input_cost_per_token"]
output_cost_sum += model_cost[model]["output_cost_per_token"]
prompt_tokens_cost_usd_dollar = (
model_cost[model]["input_cost_per_token"]
* response_obj["usage"]["prompt_tokens"]
)
completion_tokens_cost_usd_dollar = (
model_cost[model]["output_cost_per_token"]
* response_obj["usage"]["completion_tokens"]
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
def log_event(self, model, response_obj, start_time, end_time, print_verbose):
# Method definition
try:
print_verbose(
f"AISpend Logging - Enters logging function for model {model}"
)
response_timestamp = datetime.datetime.fromtimestamp(
int(response_obj["created"])
).strftime("%Y-%m-%d")
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
) = self.price_calculator(model, response_obj, start_time, end_time)
prompt_tokens_cost_usd_cent = prompt_tokens_cost_usd_dollar * 100
completion_tokens_cost_usd_cent = completion_tokens_cost_usd_dollar * 100
data = [
{
"requests": 1,
"requests_context": 1,
"context_tokens": response_obj["usage"]["prompt_tokens"],
"requests_generated": 1,
"generated_tokens": response_obj["usage"]["completion_tokens"],
"recorded_date": response_timestamp,
"model_id": response_obj["model"],
"generated_tokens_cost_usd_cent": prompt_tokens_cost_usd_cent,
"context_tokens_cost_usd_cent": completion_tokens_cost_usd_cent,
}
]
print_verbose(f"AISpend Logging - final data object: {data}")
except Exception:
print_verbose(f"AISpend Logging Error - {traceback.format_exc()}")
pass


@ -7,135 +7,208 @@ this file has Arize ai specific helper functions
import json
from typing import TYPE_CHECKING, Any, Optional, Union
from litellm._logging import verbose_proxy_logger
from litellm._logging import verbose_logger
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
from .opentelemetry import OpenTelemetryConfig as _OpenTelemetryConfig
Span = _Span
OpenTelemetryConfig = _OpenTelemetryConfig
else:
Span = Any
OpenTelemetryConfig = Any
import os
from litellm.types.integrations.arize import *
def make_json_serializable(payload: dict) -> dict:
for key, value in payload.items():
class ArizeLogger:
@staticmethod
def set_arize_ai_attributes(span: Span, kwargs, response_obj):
from litellm.integrations._types.open_inference import (
MessageAttributes,
MessageContentAttributes,
OpenInferenceSpanKindValues,
SpanAttributes,
)
try:
if isinstance(value, dict):
# recursively sanitize dicts
payload[key] = make_json_serializable(value.copy())
elif not isinstance(value, (str, int, float, bool, type(None))):
# everything else becomes a string
payload[key] = str(value)
except Exception:
# non blocking if it can't cast to a str
optional_params = kwargs.get("optional_params", {})
# litellm_params = kwargs.get("litellm_params", {}) or {}
#############################################
############ LLM CALL METADATA ##############
#############################################
# commented out for now - looks like Arize AI could not log this
# metadata = litellm_params.get("metadata", {}) or {}
# span.set_attribute(SpanAttributes.METADATA, str(metadata))
#############################################
########## LLM Request Attributes ###########
#############################################
# The name of the LLM a request is being made to
if kwargs.get("model"):
span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))
span.set_attribute(
SpanAttributes.OPENINFERENCE_SPAN_KIND,
OpenInferenceSpanKindValues.LLM.value,
)
messages = kwargs.get("messages")
# for /chat/completions
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
if messages:
span.set_attribute(
SpanAttributes.INPUT_VALUE,
messages[-1].get("content", ""), # get the last message for input
)
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
for idx, msg in enumerate(messages):
# Set the role per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
msg["role"],
)
# Set the content per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
msg.get("content", ""),
)
# The Generative AI Provider: Azure, OpenAI, etc.
_optional_params = ArizeLogger.make_json_serializable(optional_params)
_json_optional_params = json.dumps(_optional_params)
span.set_attribute(
SpanAttributes.LLM_INVOCATION_PARAMETERS, _json_optional_params
)
if optional_params.get("user"):
span.set_attribute(SpanAttributes.USER_ID, optional_params.get("user"))
#############################################
########## LLM Response Attributes ##########
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
#############################################
for choice in response_obj.get("choices"):
response_message = choice.get("message", {})
span.set_attribute(
SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
)
# This shows up under `output_messages` tab on the span page
# This code assumes a single response
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
response_message["role"],
)
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
response_message.get("content", ""),
)
usage = response_obj.get("usage")
if usage:
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
usage.get("total_tokens"),
)
# The number of tokens used in the LLM response (completion).
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
usage.get("completion_tokens"),
)
# The number of tokens used in the LLM prompt.
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
usage.get("prompt_tokens"),
)
pass
return payload
except Exception as e:
verbose_logger.error(f"Error setting arize attributes: {e}")
###################### Helper functions ######################
def set_arize_ai_attributes(span: Span, kwargs, response_obj):
from litellm.integrations._types.open_inference import (
MessageAttributes,
MessageContentAttributes,
OpenInferenceSpanKindValues,
SpanAttributes,
)
@staticmethod
def _get_arize_config() -> ArizeConfig:
"""
Helper function to get Arize configuration.
try:
Returns:
ArizeConfig: A Pydantic model containing Arize configuration.
optional_params = kwargs.get("optional_params", {})
# litellm_params = kwargs.get("litellm_params", {}) or {}
Raises:
ValueError: If required environment variables are not set.
"""
space_key = os.environ.get("ARIZE_SPACE_KEY")
api_key = os.environ.get("ARIZE_API_KEY")
#############################################
############ LLM CALL METADATA ##############
#############################################
# commented out for now - looks like Arize AI could not log this
# metadata = litellm_params.get("metadata", {}) or {}
# span.set_attribute(SpanAttributes.METADATA, str(metadata))
if not space_key:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
if not api_key:
raise ValueError("ARIZE_API_KEY not found in environment variables")
#############################################
########## LLM Request Attributes ###########
#############################################
# The name of the LLM a request is being made to
if kwargs.get("model"):
span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))
span.set_attribute(
SpanAttributes.OPENINFERENCE_SPAN_KIND,
OpenInferenceSpanKindValues.LLM.value,
)
messages = kwargs.get("messages")
# for /chat/completions
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
if messages:
span.set_attribute(
SpanAttributes.INPUT_VALUE,
messages[-1].get("content", ""), # get the last message for input
grpc_endpoint = os.environ.get("ARIZE_ENDPOINT")
http_endpoint = os.environ.get("ARIZE_HTTP_ENDPOINT")
if grpc_endpoint is None and http_endpoint is None:
# use default arize grpc endpoint
verbose_logger.debug(
"No ARIZE_ENDPOINT or ARIZE_HTTP_ENDPOINT found, using default endpoint: https://otlp.arize.com/v1"
)
grpc_endpoint = "https://otlp.arize.com/v1"
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
for idx, msg in enumerate(messages):
# Set the role per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
msg["role"],
)
# Set the content per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
msg.get("content", ""),
)
# The Generative AI Provider: Azure, OpenAI, etc.
_optional_params = make_json_serializable(optional_params)
_json_optional_params = json.dumps(_optional_params)
span.set_attribute(
SpanAttributes.LLM_INVOCATION_PARAMETERS, _json_optional_params
return ArizeConfig(
space_key=space_key,
api_key=api_key,
grpc_endpoint=grpc_endpoint,
http_endpoint=http_endpoint,
)
if optional_params.get("user"):
span.set_attribute(SpanAttributes.USER_ID, optional_params.get("user"))
@staticmethod
def get_arize_opentelemetry_config() -> Optional[OpenTelemetryConfig]:
"""
Helper function to get OpenTelemetry configuration for Arize.
#############################################
########## LLM Response Attributes ##########
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
#############################################
for choice in response_obj.get("choices"):
response_message = choice.get("message", {})
span.set_attribute(
SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
Args:
arize_config (ArizeConfig): Arize configuration object.
Returns:
OpenTelemetryConfig: Configuration for OpenTelemetry.
"""
from .opentelemetry import OpenTelemetryConfig
arize_config = ArizeLogger._get_arize_config()
if arize_config.http_endpoint:
return OpenTelemetryConfig(
exporter="otlp_http",
endpoint=arize_config.http_endpoint,
)
# This shows up under `output_messages` tab on the span page
# This code assumes a single response
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
response_message["role"],
)
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
response_message.get("content", ""),
)
# use default arize grpc endpoint
return OpenTelemetryConfig(
exporter="otlp_grpc",
endpoint=arize_config.grpc_endpoint,
)
usage = response_obj.get("usage")
if usage:
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
usage.get("total_tokens"),
)
# The number of tokens used in the LLM response (completion).
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
usage.get("completion_tokens"),
)
# The number of tokens used in the LLM prompt.
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
usage.get("prompt_tokens"),
)
pass
except Exception as e:
verbose_proxy_logger.error(f"Error setting arize attributes: {e}")
@staticmethod
def make_json_serializable(payload: dict) -> dict:
for key, value in payload.items():
try:
if isinstance(value, dict):
# recursively sanitize dicts
payload[key] = ArizeLogger.make_json_serializable(value.copy())
elif not isinstance(value, (str, int, float, bool, type(None))):
# everything else becomes a string
payload[key] = str(value)
except Exception:
# non blocking if it can't cast to a str
pass
return payload


@ -1,104 +0,0 @@
#### What this does ####
# On success + failure, log events to aispend.io
import datetime
import os
import traceback
import dotenv
import requests # type: ignore
model_cost = {
"gpt-3.5-turbo": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-35-turbo": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
}, # azure model name
"gpt-3.5-turbo-0613": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-3.5-turbo-0301": {
"max_tokens": 4000,
"input_cost_per_token": 0.0000015,
"output_cost_per_token": 0.000002,
},
"gpt-3.5-turbo-16k": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
},
"gpt-35-turbo-16k": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
}, # azure model name
"gpt-3.5-turbo-16k-0613": {
"max_tokens": 16000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000004,
},
"gpt-4": {
"max_tokens": 8000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.00006,
},
"gpt-4-0613": {
"max_tokens": 8000,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.00006,
},
"gpt-4-32k": {
"max_tokens": 8000,
"input_cost_per_token": 0.00006,
"output_cost_per_token": 0.00012,
},
"claude-instant-1": {
"max_tokens": 100000,
"input_cost_per_token": 0.00000163,
"output_cost_per_token": 0.00000551,
},
"claude-2": {
"max_tokens": 100000,
"input_cost_per_token": 0.00001102,
"output_cost_per_token": 0.00003268,
},
"text-bison-001": {
"max_tokens": 8192,
"input_cost_per_token": 0.000004,
"output_cost_per_token": 0.000004,
},
"chat-bison-001": {
"max_tokens": 4096,
"input_cost_per_token": 0.000002,
"output_cost_per_token": 0.000002,
},
"command-nightly": {
"max_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000015,
},
}
class BerriSpendLogger:
# Class variables or attributes
def __init__(self):
# Instance variables
self.account_id = os.getenv("BERRISPEND_ACCOUNT_ID")
def price_calculator(self, model, response_obj, start_time, end_time):
return
def log_event(
self, model, messages, response_obj, start_time, end_time, print_verbose
):
"""
This integration is not implemented yet.
"""
return


@ -1,334 +0,0 @@
# callback to make a request to an API endpoint
#### What this does ####
# On success, logs events to Promptlayer
import datetime
import json
import os
import traceback
from typing import Literal, Optional, Union
import dotenv
import requests
import litellm
from litellm._logging import verbose_logger
from litellm.caching.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.utils import StandardLoggingPayload
#### What this does ####
# On success + failure, log events to Supabase
def create_client():
try:
import clickhouse_connect
port = os.getenv("CLICKHOUSE_PORT")
clickhouse_host = os.getenv("CLICKHOUSE_HOST")
if clickhouse_host is not None:
verbose_logger.debug("setting up clickhouse")
port = os.getenv("CLICKHOUSE_PORT")
if port is not None and isinstance(port, str):
port = int(port)
host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
if host is None:
raise ValueError("CLICKHOUSE_HOST is not set")
username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
if username is None:
raise ValueError("CLICKHOUSE_USERNAME is not set")
password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
if password is None:
raise ValueError("CLICKHOUSE_PASSWORD is not set")
if port is None:
raise ValueError("CLICKHOUSE_PORT is not set")
client = clickhouse_connect.get_client(
host=host,
port=port,
username=username,
password=password,
)
return client
else:
raise Exception("Clickhouse: Clickhouse host not set")
except Exception as e:
raise ValueError(f"Clickhouse: {e}")
def build_daily_metrics():
click_house_client = create_client()
# get daily spend
daily_spend = click_house_client.query_df(
"""
SELECT sumMerge(DailySpend) as daily_spend, day FROM daily_aggregated_spend GROUP BY day
"""
)
# get daily spend per model
daily_spend_per_model = click_house_client.query_df(
"""
SELECT sumMerge(DailySpend) as daily_spend, day, model FROM daily_aggregated_spend_per_model GROUP BY day, model
"""
)
new_df = daily_spend_per_model.to_dict(orient="records")
import pandas as pd
df = pd.DataFrame(new_df)
# Group by 'day' and create a dictionary for each group
result_dict = {}
for day, group in df.groupby("day"):
models = group["model"].tolist()
spend = group["daily_spend"].tolist()
spend_per_model = {model: spend for model, spend in zip(models, spend)}
result_dict[day] = spend_per_model
# Display the resulting dictionary
# get daily spend per API key
daily_spend_per_api_key = click_house_client.query_df(
"""
SELECT
daily_spend,
day,
api_key
FROM (
SELECT
sumMerge(DailySpend) as daily_spend,
day,
api_key,
RANK() OVER (PARTITION BY day ORDER BY sumMerge(DailySpend) DESC) as spend_rank
FROM
daily_aggregated_spend_per_api_key
GROUP BY
day,
api_key
) AS ranked_api_keys
WHERE
spend_rank <= 5
AND day IS NOT NULL
ORDER BY
day,
daily_spend DESC
"""
)
new_df = daily_spend_per_api_key.to_dict(orient="records")
import pandas as pd
df = pd.DataFrame(new_df)
# Group by 'day' and create a dictionary for each group
api_key_result_dict = {}
for day, group in df.groupby("day"):
api_keys = group["api_key"].tolist()
spend = group["daily_spend"].tolist()
spend_per_api_key = {api_key: spend for api_key, spend in zip(api_keys, spend)}
api_key_result_dict[day] = spend_per_api_key
# Display the resulting dictionary
# Calculate total spend across all days
total_spend = daily_spend["daily_spend"].sum()
# Identify top models and top API keys with the highest spend across all days
top_models = {}
top_api_keys = {}
for day, spend_per_model in result_dict.items():
for model, model_spend in spend_per_model.items():
if model not in top_models or model_spend > top_models[model]:
top_models[model] = model_spend
for day, spend_per_api_key in api_key_result_dict.items():
for api_key, api_key_spend in spend_per_api_key.items():
if api_key not in top_api_keys or api_key_spend > top_api_keys[api_key]:
top_api_keys[api_key] = api_key_spend
# for each day in daily spend, look up the day in result_dict and api_key_result_dict
# Assuming daily_spend DataFrame has 'day' column
result = []
for index, row in daily_spend.iterrows():
day = row["day"]
data_day = row.to_dict()
# Look up in result_dict
if day in result_dict:
spend_per_model = result_dict[day]
# Assuming there is a column named 'model' in daily_spend
data_day["spend_per_model"] = spend_per_model # Assign 0 if model not found
# Look up in api_key_result_dict
if day in api_key_result_dict:
spend_per_api_key = api_key_result_dict[day]
# Assuming there is a column named 'api_key' in daily_spend
data_day["spend_per_api_key"] = spend_per_api_key
result.append(data_day)
data_to_return = {}
data_to_return["daily_spend"] = result
data_to_return["total_spend"] = total_spend
data_to_return["top_models"] = top_models
data_to_return["top_api_keys"] = top_api_keys
return data_to_return
# build_daily_metrics()
def _start_clickhouse():
import clickhouse_connect
port = os.getenv("CLICKHOUSE_PORT")
clickhouse_host = os.getenv("CLICKHOUSE_HOST")
if clickhouse_host is not None:
verbose_logger.debug("setting up clickhouse")
if port is not None and isinstance(port, str):
port = int(port)
port = os.getenv("CLICKHOUSE_PORT")
if port is not None and isinstance(port, str):
port = int(port)
host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
if host is None:
raise ValueError("CLICKHOUSE_HOST is not set")
username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
if username is None:
raise ValueError("CLICKHOUSE_USERNAME is not set")
password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
if password is None:
raise ValueError("CLICKHOUSE_PASSWORD is not set")
if port is None:
raise ValueError("CLICKHOUSE_PORT is not set")
client = clickhouse_connect.get_client(
host=host,
port=port,
username=username,
password=password,
)
# view all tables in DB
response = client.query("SHOW TABLES")
verbose_logger.debug(
f"checking if litellm spend logs exists, all tables={response.result_rows}"
)
# all tables is returned like this: all tables = [('new_table',), ('spend_logs',)]
# check if spend_logs in all tables
table_names = [all_tables[0] for all_tables in response.result_rows]
if "spend_logs" not in table_names:
verbose_logger.debug(
"Clickhouse: spend logs table does not exist... creating it"
)
response = client.command(
"""
CREATE TABLE default.spend_logs
(
`request_id` String,
`call_type` String,
`api_key` String,
`spend` Float64,
`total_tokens` Int256,
`prompt_tokens` Int256,
`completion_tokens` Int256,
`startTime` DateTime,
`endTime` DateTime,
`model` String,
`user` String,
`metadata` String,
`cache_hit` String,
`cache_key` String,
`request_tags` String
)
ENGINE = MergeTree
ORDER BY tuple();
"""
)
else:
# check if spend logs exist, if it does then return the schema
response = client.query("DESCRIBE default.spend_logs")
verbose_logger.debug(f"spend logs schema ={response.result_rows}")
class ClickhouseLogger:
# Class variables or attributes
def __init__(self, endpoint=None, headers=None):
import clickhouse_connect
_start_clickhouse()
verbose_logger.debug(
f"ClickhouseLogger init, host {os.getenv('CLICKHOUSE_HOST')}, port {os.getenv('CLICKHOUSE_PORT')}, username {os.getenv('CLICKHOUSE_USERNAME')}"
)
port = os.getenv("CLICKHOUSE_PORT")
if port is not None and isinstance(port, str):
port = int(port)
host: Optional[str] = os.getenv("CLICKHOUSE_HOST")
if host is None:
raise ValueError("CLICKHOUSE_HOST is not set")
username: Optional[str] = os.getenv("CLICKHOUSE_USERNAME")
if username is None:
raise ValueError("CLICKHOUSE_USERNAME is not set")
password: Optional[str] = os.getenv("CLICKHOUSE_PASSWORD")
if password is None:
raise ValueError("CLICKHOUSE_PASSWORD is not set")
if port is None:
raise ValueError("CLICKHOUSE_PORT is not set")
client = clickhouse_connect.get_client(
host=host,
port=port,
username=username,
password=password,
)
self.client = client
# This is sync, because we run this in a separate thread. Running in a sepearate thread ensures it will never block an LLM API call
# Experience with s3, Langfuse shows that async logging events are complicated and can block LLM calls
def log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
):
try:
verbose_logger.debug(
f"ClickhouseLogger Logging - Enters logging function for model {kwargs}"
)
# follows the same params as langfuse.py
payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object"
)
if payload is None:
return
# Build the initial payload
verbose_logger.debug(f"\nClickhouse Logger - Logging payload = {payload}")
# just get the payload items in one array and payload keys in 2nd array
values = []
keys = []
for key, value in payload.items():
keys.append(key)
values.append(value)
data = [values]
response = self.client.insert("default.spend_logs", data, column_names=keys)
# make request to endpoint with payload
verbose_logger.debug(f"Clickhouse Logger - final response = {response}")
except Exception as e:
verbose_logger.debug(f"Clickhouse - {str(e)}\n{traceback.format_exc()}")
pass


@ -0,0 +1,169 @@
"""
Implements logging integration with Datadog's LLM Observability Service
API Reference: https://docs.datadoghq.com/llm_observability/setup/api/?tab=example#api-standards
"""
import asyncio
import os
import traceback
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from httpx import Response
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.llms.custom_httpx.http_handler import (
get_async_httpx_client,
httpxSpecialProvider,
)
from litellm.types.integrations.datadog_llm_obs import *
from litellm.types.utils import StandardLoggingPayload
class DataDogLLMObsLogger(CustomBatchLogger):
def __init__(self, **kwargs):
try:
verbose_logger.debug("DataDogLLMObs: Initializing logger")
if os.getenv("DD_API_KEY", None) is None:
raise Exception("DD_API_KEY is not set, set 'DD_API_KEY=<>'")
if os.getenv("DD_SITE", None) is None:
raise Exception(
"DD_SITE is not set, set 'DD_SITE=<>', example sit = `us5.datadoghq.com`"
)
self.async_client = get_async_httpx_client(
llm_provider=httpxSpecialProvider.LoggingCallback
)
self.DD_API_KEY = os.getenv("DD_API_KEY")
self.DD_SITE = os.getenv("DD_SITE")
self.intake_url = (
f"https://api.{self.DD_SITE}/api/intake/llm-obs/v1/trace/spans"
)
# testing base url
dd_base_url = os.getenv("DD_BASE_URL")
if dd_base_url:
self.intake_url = f"{dd_base_url}/api/intake/llm-obs/v1/trace/spans"
asyncio.create_task(self.periodic_flush())
self.flush_lock = asyncio.Lock()
self.log_queue: List[LLMObsPayload] = []
super().__init__(**kwargs, flush_lock=self.flush_lock)
except Exception as e:
verbose_logger.exception(f"DataDogLLMObs: Error initializing - {str(e)}")
raise e
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
verbose_logger.debug(
f"DataDogLLMObs: Logging success event for model {kwargs.get('model', 'unknown')}"
)
payload = self.create_llm_obs_payload(
kwargs, response_obj, start_time, end_time
)
verbose_logger.debug(f"DataDogLLMObs: Payload: {payload}")
self.log_queue.append(payload)
if len(self.log_queue) >= self.batch_size:
await self.async_send_batch()
except Exception as e:
verbose_logger.exception(
f"DataDogLLMObs: Error logging success event - {str(e)}"
)
async def async_send_batch(self):
try:
if not self.log_queue:
return
verbose_logger.debug(
f"DataDogLLMObs: Flushing {len(self.log_queue)} events"
)
# Prepare the payload
payload = {
"data": DDIntakePayload(
type="span",
attributes=DDSpanAttributes(
ml_app="litellm",
tags=[
"service:litellm",
f"env:{os.getenv('DD_ENV', 'production')}",
],
spans=self.log_queue,
),
),
}
response = await self.async_client.post(
url=self.intake_url,
json=payload,
headers={
"DD-API-KEY": self.DD_API_KEY,
"Content-Type": "application/json",
},
)
response.raise_for_status()
if response.status_code != 202:
raise Exception(
f"DataDogLLMObs: Unexpected response - status_code: {response.status_code}, text: {response.text}"
)
verbose_logger.debug(
f"DataDogLLMObs: Successfully sent batch - status_code: {response.status_code}"
)
self.log_queue.clear()
except Exception as e:
verbose_logger.exception(f"DataDogLLMObs: Error sending batch - {str(e)}")
def create_llm_obs_payload(
self, kwargs: Dict, response_obj: Any, start_time: datetime, end_time: datetime
) -> LLMObsPayload:
standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object"
)
if standard_logging_payload is None:
raise Exception("DataDogLLMObs: standard_logging_object is not set")
messages = standard_logging_payload["messages"]
metadata = kwargs.get("litellm_params", {}).get("metadata", {})
input_meta = InputMeta(messages=messages) # type: ignore
output_meta = OutputMeta(messages=self._get_response_messages(response_obj))
meta = Meta(kind="llm", input=input_meta, output=output_meta)
# Calculate metrics (you may need to adjust these based on available data)
metrics = LLMMetrics(
input_tokens=float(standard_logging_payload.get("prompt_tokens", 0)),
output_tokens=float(standard_logging_payload.get("completion_tokens", 0)),
total_tokens=float(standard_logging_payload.get("total_tokens", 0)),
)
return LLMObsPayload(
parent_id=metadata.get("parent_id", "undefined"),
trace_id=metadata.get("trace_id", str(uuid.uuid4())),
span_id=metadata.get("span_id", str(uuid.uuid4())),
name=metadata.get("name", "litellm_llm_call"),
meta=meta,
start_ns=int(start_time.timestamp() * 1e9),
duration=int((end_time - start_time).total_seconds() * 1e9),
metrics=metrics,
)
def _get_response_messages(self, response_obj: Any) -> List[Any]:
"""
Get the messages from the response object
for now this handles logging /chat/completions responses
"""
if isinstance(response_obj, litellm.ModelResponse):
return [response_obj["choices"][0]["message"].json()]
return []
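A minimal usage sketch for this logger, assuming the `datadog_llm_observability` callback name registered later in this diff and the DD_API_KEY / DD_SITE environment variables read in __init__; the exact wiring may differ by litellm version.

# Hedged sketch, not part of this diff: enable DataDog LLM Observability logging.
import os
import litellm

os.environ["DD_API_KEY"] = "your-datadog-api-key"   # placeholder
os.environ["DD_SITE"] = "us5.datadoghq.com"         # placeholder site

litellm.callbacks = ["datadog_llm_observability"]    # assumption: string callback name

response = litellm.completion(
    model="gpt-4o-mini",                             # placeholder model
    messages=[{"role": "user", "content": "ping"}],
)
# Successful calls are queued by async_log_success_event and flushed to the
# llm-obs intake endpoint in batches once batch_size events accumulate.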

View file

@@ -4,7 +4,7 @@ import copy
import inspect
import os
import traceback
from typing import Optional
from typing import TYPE_CHECKING, Any, Dict, Optional
from packaging.version import Version
from pydantic import BaseModel
@@ -13,7 +13,13 @@ import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
from litellm.secret_managers.main import str_to_bool
from litellm.types.utils import StandardLoggingPayload
from litellm.types.integrations.langfuse import *
from litellm.types.utils import StandardCallbackDynamicParams, StandardLoggingPayload
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import DynamicLoggingCache
else:
DynamicLoggingCache = Any
class LangFuseLogger:
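For context, a self-contained sketch of the TYPE_CHECKING pattern added above: the real class is imported only for static type checkers, while at runtime the name degrades to Any so no import cycle is created. The module and class names below are illustrative, not from this diff.

# Illustrative sketch of the TYPE_CHECKING / Any fallback pattern.
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # seen only by mypy/pyright, never executed at runtime
    from heavy_module import HeavyClass  # hypothetical import
else:
    # the interpreter resolves the annotation to Any, avoiding a circular import
    HeavyClass = Any


def describe(obj: "HeavyClass") -> str:
    # type checkers get the precise type; runtime behavior is unchanged
    return repr(obj)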

View file

@@ -0,0 +1,168 @@
"""
This file contains the LangFuseHandler class
Used to get the LangFuseLogger for a given request
Handles Key/Team Based Langfuse Logging
"""
from typing import TYPE_CHECKING, Any, Dict, Optional
from litellm.litellm_core_utils.litellm_logging import StandardCallbackDynamicParams
from .langfuse import LangFuseLogger, LangfuseLoggingConfig
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import DynamicLoggingCache
else:
DynamicLoggingCache = Any
class LangFuseHandler:
@staticmethod
def get_langfuse_logger_for_request(
standard_callback_dynamic_params: StandardCallbackDynamicParams,
in_memory_dynamic_logger_cache: DynamicLoggingCache,
globalLangfuseLogger: Optional[LangFuseLogger] = None,
) -> LangFuseLogger:
"""
This function is used to get the LangFuseLogger for a given request
1. If dynamic credentials are passed
- check if a LangFuseLogger is cached for the dynamic credentials
- if cached LangFuseLogger is not found, create a new LangFuseLogger and cache it
2. If dynamic credentials are not passed, return the globalLangfuseLogger
"""
temp_langfuse_logger: Optional[LangFuseLogger] = globalLangfuseLogger
if (
LangFuseHandler._dynamic_langfuse_credentials_are_passed(
standard_callback_dynamic_params
)
is False
):
return LangFuseHandler._return_global_langfuse_logger(
globalLangfuseLogger=globalLangfuseLogger,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
# get langfuse logging config to use for this request, based on standard_callback_dynamic_params
_credentials = LangFuseHandler.get_dynamic_langfuse_logging_config(
globalLangfuseLogger=globalLangfuseLogger,
standard_callback_dynamic_params=standard_callback_dynamic_params,
)
credentials_dict = dict(_credentials)
# check if langfuse logger is already cached
temp_langfuse_logger = in_memory_dynamic_logger_cache.get_cache(
credentials=credentials_dict, service_name="langfuse"
)
# if not cached, create a new langfuse logger and cache it
if temp_langfuse_logger is None:
temp_langfuse_logger = (
LangFuseHandler._create_langfuse_logger_from_credentials(
credentials=credentials_dict,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
)
return temp_langfuse_logger
@staticmethod
def _return_global_langfuse_logger(
globalLangfuseLogger: Optional[LangFuseLogger],
in_memory_dynamic_logger_cache: DynamicLoggingCache,
) -> LangFuseLogger:
"""
Returns the Global LangfuseLogger set on litellm
(this is the default langfuse logger - used when no dynamic credentials are passed)
If no Global LangfuseLogger is set, it falls back to a LangFuseLogger cached in in_memory_dynamic_logger_cache, creating and caching one if needed.
"""
if globalLangfuseLogger is not None:
return globalLangfuseLogger
credentials_dict: Dict[str, Any] = (
{}
) # the global langfuse logger uses Environment Variables, there are no dynamic credentials
globalLangfuseLogger = in_memory_dynamic_logger_cache.get_cache(
credentials=credentials_dict,
service_name="langfuse",
)
if globalLangfuseLogger is None:
globalLangfuseLogger = (
LangFuseHandler._create_langfuse_logger_from_credentials(
credentials=credentials_dict,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
)
return globalLangfuseLogger
@staticmethod
def _create_langfuse_logger_from_credentials(
credentials: Dict,
in_memory_dynamic_logger_cache: DynamicLoggingCache,
) -> LangFuseLogger:
"""
This function is used to
1. create a LangFuseLogger from the credentials
2. cache the LangFuseLogger to prevent re-creating it for the same credentials
"""
langfuse_logger = LangFuseLogger(
langfuse_public_key=credentials.get("langfuse_public_key"),
langfuse_secret=credentials.get("langfuse_secret"),
langfuse_host=credentials.get("langfuse_host"),
)
in_memory_dynamic_logger_cache.set_cache(
credentials=credentials,
service_name="langfuse",
logging_obj=langfuse_logger,
)
return langfuse_logger
@staticmethod
def get_dynamic_langfuse_logging_config(
standard_callback_dynamic_params: StandardCallbackDynamicParams,
globalLangfuseLogger: Optional[LangFuseLogger] = None,
) -> LangfuseLoggingConfig:
"""
This function is used to get the Langfuse logging config to use for a given request.
It checks if the dynamic parameters are provided in the standard_callback_dynamic_params and uses them to get the Langfuse logging config.
If no dynamic parameters are provided, it uses the `globalLangfuseLogger` values
"""
# only use dynamic params if langfuse credentials are passed dynamically
return LangfuseLoggingConfig(
langfuse_secret=standard_callback_dynamic_params.get("langfuse_secret")
or standard_callback_dynamic_params.get("langfuse_secret_key"),
langfuse_public_key=standard_callback_dynamic_params.get(
"langfuse_public_key"
),
langfuse_host=standard_callback_dynamic_params.get("langfuse_host"),
)
@staticmethod
def _dynamic_langfuse_credentials_are_passed(
standard_callback_dynamic_params: StandardCallbackDynamicParams,
) -> bool:
"""
This function is used to check if the dynamic langfuse credentials are passed in standard_callback_dynamic_params
Returns:
bool: True if the dynamic langfuse credentials are passed, False otherwise
"""
if (
standard_callback_dynamic_params.get("langfuse_host") is not None
or standard_callback_dynamic_params.get("langfuse_public_key") is not None
or standard_callback_dynamic_params.get("langfuse_secret") is not None
or standard_callback_dynamic_params.get("langfuse_secret_key") is not None
):
return True
return False
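A hedged sketch of exercising this handler directly, using only the signatures above; the import paths mirror the TYPE_CHECKING import in this file and the handler import used elsewhere in this diff, and the credential values are placeholders.

# Sketch (assumed import paths; placeholder credentials).
from litellm.litellm_core_utils.litellm_logging import DynamicLoggingCache
from litellm.integrations.langfuse.langfuse_handler import LangFuseHandler

cache = DynamicLoggingCache()

# Request-scoped credentials; if none are set, the global logger is returned instead.
dynamic_params = {
    "langfuse_public_key": "pk-lf-...",
    "langfuse_secret": "sk-lf-...",
    "langfuse_host": "https://cloud.langfuse.com",
}

langfuse_logger = LangFuseHandler.get_langfuse_logger_for_request(
    standard_callback_dynamic_params=dynamic_params,  # type: ignore[arg-type]
    in_memory_dynamic_logger_cache=cache,
    globalLangfuseLogger=None,
)
# Calling again with the same credentials returns the cached LangFuseLogger
# rather than constructing a new Langfuse client.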

View file

@@ -1,77 +0,0 @@
import json
import os
import traceback
import types
import requests
class LiteDebugger:
user_email = None
dashboard_url = None
def __init__(self, email=None):
self.api_url = "https://api.litellm.ai/debugger"
self.validate_environment(email)
pass
def validate_environment(self, email):
try:
self.user_email = (
email or os.getenv("LITELLM_TOKEN") or os.getenv("LITELLM_EMAIL")
)
if (
self.user_email is None
): # if users are trying to use_client=True but token not set
raise ValueError(
"litellm.use_client = True but no token or email passed. Please set it in litellm.token"
)
self.dashboard_url = "https://admin.litellm.ai/" + self.user_email
if self.user_email is None:
raise ValueError(
"[Non-Blocking Error] LiteLLMDebugger: Missing LITELLM_TOKEN. Set it in your environment. Eg.: os.environ['LITELLM_TOKEN']= <your_email>"
)
except Exception:
raise ValueError(
"[Non-Blocking Error] LiteLLMDebugger: Missing LITELLM_TOKEN. Set it in your environment. Eg.: os.environ['LITELLM_TOKEN']= <your_email>"
)
def input_log_event(
self,
model,
messages,
end_user,
litellm_call_id,
call_type,
print_verbose,
litellm_params,
optional_params,
):
"""
This integration is not implemented yet.
"""
return
def post_call_log_event(
self, original_response, litellm_call_id, print_verbose, call_type, stream
):
"""
This integration is not implemented yet.
"""
return
def log_event(
self,
end_user,
response_obj,
start_time,
end_time,
litellm_call_id,
print_verbose,
call_type,
stream=False,
):
"""
This integration is not implemented yet.
"""
return

View file

@@ -171,7 +171,7 @@ class OpenTelemetry(CustomLogger):
try:
value = str(value)
except Exception:
value = "litllm logging error - could_not_json_serialize"
value = "litellm logging error - could_not_json_serialize"
self.safe_set_attribute(
span=service_logging_span,
key=key,
@@ -396,9 +396,9 @@ class OpenTelemetry(CustomLogger):
def set_attributes(self, span: Span, kwargs, response_obj): # noqa: PLR0915
try:
if self.callback_name == "arize":
from litellm.integrations.arize_ai import set_arize_ai_attributes
from litellm.integrations.arize_ai import ArizeLogger
set_arize_ai_attributes(span, kwargs, response_obj)
ArizeLogger.set_arize_ai_attributes(span, kwargs, response_obj)
return
elif self.callback_name == "langtrace":
from litellm.integrations.langtrace import LangtraceAttributes

View file

@@ -6,7 +6,7 @@ import subprocess
import sys
import traceback
import uuid
from datetime import datetime, timedelta
from datetime import date, datetime, timedelta
from typing import Optional, TypedDict, Union
import dotenv
@@ -334,13 +334,8 @@ class PrometheusLogger(CustomLogger):
print_verbose(f"Got exception on init prometheus client {str(e)}")
raise e
async def async_log_success_event( # noqa: PLR0915
self, kwargs, response_obj, start_time, end_time
):
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
# Define prometheus client
from litellm.proxy.common_utils.callback_utils import (
get_model_group_from_litellm_kwargs,
)
from litellm.types.utils import StandardLoggingPayload
verbose_logger.debug(
@@ -351,14 +346,19 @@ class PrometheusLogger(CustomLogger):
standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
"standard_logging_object"
)
if standard_logging_payload is None:
raise ValueError("standard_logging_object is required")
if standard_logging_payload is None or not isinstance(
standard_logging_payload, dict
):
raise ValueError(
f"standard_logging_object is required, got={standard_logging_payload}"
)
model = kwargs.get("model", "")
litellm_params = kwargs.get("litellm_params", {}) or {}
_metadata = litellm_params.get("metadata", {})
proxy_server_request = litellm_params.get("proxy_server_request") or {}
end_user_id = proxy_server_request.get("body", {}).get("user", None)
model_parameters: dict = standard_logging_payload["model_parameters"]
user_id = standard_logging_payload["metadata"]["user_api_key_user_id"]
user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"]
user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"]
@@ -369,25 +369,6 @@ class PrometheusLogger(CustomLogger):
output_tokens = standard_logging_payload["completion_tokens"]
tokens_used = standard_logging_payload["total_tokens"]
response_cost = standard_logging_payload["response_cost"]
_team_spend = litellm_params.get("metadata", {}).get(
"user_api_key_team_spend", None
)
_team_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_team_max_budget", None
)
_remaining_team_budget = safe_get_remaining_budget(
max_budget=_team_max_budget, spend=_team_spend
)
_api_key_spend = litellm_params.get("metadata", {}).get(
"user_api_key_spend", None
)
_api_key_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_max_budget", None
)
_remaining_api_key_budget = safe_get_remaining_budget(
max_budget=_api_key_max_budget, spend=_api_key_spend
)
print_verbose(
f"inside track_prometheus_metrics, model {model}, response_cost {response_cost}, tokens_used {tokens_used}, end_user_id {end_user_id}, user_api_key {user_api_key}"
@@ -402,24 +383,82 @@ class PrometheusLogger(CustomLogger):
user_api_key = hash_token(user_api_key)
self.litellm_requests_metric.labels(
end_user_id,
user_api_key,
user_api_key_alias,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc()
self.litellm_spend_metric.labels(
end_user_id,
user_api_key,
user_api_key_alias,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc(response_cost)
# increment total LLM requests and spend metric
self._increment_top_level_request_and_spend_metrics(
end_user_id=end_user_id,
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
model=model,
user_api_team=user_api_team,
user_api_team_alias=user_api_team_alias,
user_id=user_id,
response_cost=response_cost,
)
# input, output, total token metrics
self._increment_token_metrics(
# why type ignore below?
# 1. We just checked if isinstance(standard_logging_payload, dict). Pyright complains.
# 2. Pyright does not allow us to run isinstance(standard_logging_payload, StandardLoggingPayload) <- this would be ideal
standard_logging_payload=standard_logging_payload, # type: ignore
end_user_id=end_user_id,
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
model=model,
user_api_team=user_api_team,
user_api_team_alias=user_api_team_alias,
user_id=user_id,
)
# remaining budget metrics
self._increment_remaining_budget_metrics(
user_api_team=user_api_team,
user_api_team_alias=user_api_team_alias,
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
litellm_params=litellm_params,
)
# set proxy virtual key rpm/tpm metrics
self._set_virtual_key_rate_limit_metrics(
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
kwargs=kwargs,
metadata=_metadata,
)
# set latency metrics
self._set_latency_metrics(
kwargs=kwargs,
model=model,
user_api_key=user_api_key,
user_api_key_alias=user_api_key_alias,
user_api_team=user_api_team,
user_api_team_alias=user_api_team_alias,
# why type ignore below?
# 1. We just checked if isinstance(standard_logging_payload, dict). Pyright complains.
# 2. Pyright does not allow us to run isinstance(standard_logging_payload, StandardLoggingPayload) <- this would be ideal
standard_logging_payload=standard_logging_payload, # type: ignore
)
# set x-ratelimit headers
self.set_llm_deployment_success_metrics(
kwargs, start_time, end_time, output_tokens
)
pass
def _increment_token_metrics(
self,
standard_logging_payload: StandardLoggingPayload,
end_user_id: Optional[str],
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
model: Optional[str],
user_api_team: Optional[str],
user_api_team_alias: Optional[str],
user_id: Optional[str],
):
# token metrics
self.litellm_tokens_metric.labels(
end_user_id,
user_api_key,
@@ -450,6 +489,34 @@ class PrometheusLogger(CustomLogger):
user_id,
).inc(standard_logging_payload["completion_tokens"])
def _increment_remaining_budget_metrics(
self,
user_api_team: Optional[str],
user_api_team_alias: Optional[str],
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
litellm_params: dict,
):
_team_spend = litellm_params.get("metadata", {}).get(
"user_api_key_team_spend", None
)
_team_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_team_max_budget", None
)
_remaining_team_budget = self._safe_get_remaining_budget(
max_budget=_team_max_budget, spend=_team_spend
)
_api_key_spend = litellm_params.get("metadata", {}).get(
"user_api_key_spend", None
)
_api_key_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_max_budget", None
)
_remaining_api_key_budget = self._safe_get_remaining_budget(
max_budget=_api_key_max_budget, spend=_api_key_spend
)
# Remaining Budget Metrics
self.litellm_remaining_team_budget_metric.labels(
user_api_team, user_api_team_alias
).set(_remaining_team_budget)
@@ -458,6 +525,47 @@ class PrometheusLogger(CustomLogger):
user_api_key, user_api_key_alias
).set(_remaining_api_key_budget)
def _increment_top_level_request_and_spend_metrics(
self,
end_user_id: Optional[str],
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
model: Optional[str],
user_api_team: Optional[str],
user_api_team_alias: Optional[str],
user_id: Optional[str],
response_cost: float,
):
self.litellm_requests_metric.labels(
end_user_id,
user_api_key,
user_api_key_alias,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc()
self.litellm_spend_metric.labels(
end_user_id,
user_api_key,
user_api_key_alias,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc(response_cost)
def _set_virtual_key_rate_limit_metrics(
self,
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
kwargs: dict,
metadata: dict,
):
from litellm.proxy.common_utils.callback_utils import (
get_model_group_from_litellm_kwargs,
)
# Set remaining rpm/tpm for API Key + model
# see parallel_request_limiter.py - variables are set there
model_group = get_model_group_from_litellm_kwargs(kwargs)
@@ -466,10 +574,8 @@ class PrometheusLogger(CustomLogger):
)
remaining_tokens_variable_name = f"litellm-key-remaining-tokens-{model_group}"
remaining_requests = _metadata.get(
remaining_requests_variable_name, sys.maxsize
)
remaining_tokens = _metadata.get(remaining_tokens_variable_name, sys.maxsize)
remaining_requests = metadata.get(remaining_requests_variable_name, sys.maxsize)
remaining_tokens = metadata.get(remaining_tokens_variable_name, sys.maxsize)
self.litellm_remaining_api_key_requests_for_model.labels(
user_api_key, user_api_key_alias, model_group
@@ -479,9 +585,20 @@ class PrometheusLogger(CustomLogger):
user_api_key, user_api_key_alias, model_group
).set(remaining_tokens)
def _set_latency_metrics(
self,
kwargs: dict,
model: Optional[str],
user_api_key: Optional[str],
user_api_key_alias: Optional[str],
user_api_team: Optional[str],
user_api_team_alias: Optional[str],
standard_logging_payload: StandardLoggingPayload,
):
# latency metrics
total_time: timedelta = kwargs.get("end_time") - kwargs.get("start_time")
total_time_seconds = total_time.total_seconds()
model_parameters: dict = standard_logging_payload["model_parameters"]
end_time: datetime = kwargs.get("end_time") or datetime.now()
start_time: Optional[datetime] = kwargs.get("start_time")
api_call_start_time = kwargs.get("api_call_start_time", None)
completion_start_time = kwargs.get("completion_start_time", None)
@@ -509,9 +626,7 @@ class PrometheusLogger(CustomLogger):
if api_call_start_time is not None and isinstance(
api_call_start_time, datetime
):
api_call_total_time: timedelta = (
kwargs.get("end_time") - api_call_start_time
)
api_call_total_time: timedelta = end_time - api_call_start_time
api_call_total_time_seconds = api_call_total_time.total_seconds()
self.litellm_llm_api_latency_metric.labels(
model,
@@ -521,20 +636,17 @@ class PrometheusLogger(CustomLogger):
user_api_team_alias,
).observe(api_call_total_time_seconds)
# log metrics
self.litellm_request_total_latency_metric.labels(
model,
user_api_key,
user_api_key_alias,
user_api_team,
user_api_team_alias,
).observe(total_time_seconds)
# set x-ratelimit headers
self.set_llm_deployment_success_metrics(
kwargs, start_time, end_time, output_tokens
)
pass
# total request latency
if start_time is not None and isinstance(start_time, datetime):
total_time: timedelta = end_time - start_time
total_time_seconds = total_time.total_seconds()
self.litellm_request_total_latency_metric.labels(
model,
user_api_key,
user_api_key_alias,
user_api_team,
user_api_team_alias,
).observe(total_time_seconds)
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
from litellm.types.utils import StandardLoggingPayload
@@ -651,24 +763,31 @@ class PrometheusLogger(CustomLogger):
pass
def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
"""
Sets Failure metrics when an LLM API call fails
- mark the deployment as partial outage
- increment deployment failure responses metric
- increment deployment total requests metric
Args:
request_kwargs: dict
"""
try:
verbose_logger.debug("setting remaining tokens requests metric")
standard_logging_payload: StandardLoggingPayload = request_kwargs.get(
"standard_logging_object", {}
)
_response_headers = request_kwargs.get("response_headers")
_litellm_params = request_kwargs.get("litellm_params", {}) or {}
_metadata = _litellm_params.get("metadata", {})
litellm_model_name = request_kwargs.get("model", None)
api_base = _metadata.get("api_base", None)
model_group = _metadata.get("model_group", None)
if api_base is None:
api_base = _litellm_params.get("api_base", None)
llm_provider = _litellm_params.get("custom_llm_provider", None)
_model_info = _metadata.get("model_info") or {}
model_id = _model_info.get("id", None)
model_group = standard_logging_payload.get("model_group", None)
api_base = standard_logging_payload.get("api_base", None)
model_id = standard_logging_payload.get("model_id", None)
exception: Exception = request_kwargs.get("exception", None)
llm_provider = _litellm_params.get("custom_llm_provider", None)
"""
log these labels
["litellm_model_name", "model_id", "api_base", "api_provider"]
@@ -891,7 +1010,7 @@ class PrometheusLogger(CustomLogger):
"""
from litellm.litellm_core_utils.litellm_logging import (
StandardLoggingMetadata,
get_standard_logging_metadata,
StandardLoggingPayloadSetup,
)
verbose_logger.debug(
@@ -900,8 +1019,10 @@ class PrometheusLogger(CustomLogger):
kwargs,
)
_metadata = kwargs.get("metadata", {})
standard_metadata: StandardLoggingMetadata = get_standard_logging_metadata(
metadata=_metadata
standard_metadata: StandardLoggingMetadata = (
StandardLoggingPayloadSetup.get_standard_logging_metadata(
metadata=_metadata
)
)
_new_model = kwargs.get("model")
self.litellm_deployment_successful_fallbacks.labels(
@@ -923,7 +1044,7 @@ class PrometheusLogger(CustomLogger):
"""
from litellm.litellm_core_utils.litellm_logging import (
StandardLoggingMetadata,
get_standard_logging_metadata,
StandardLoggingPayloadSetup,
)
verbose_logger.debug(
@@ -933,8 +1054,10 @@ class PrometheusLogger(CustomLogger):
)
_new_model = kwargs.get("model")
_metadata = kwargs.get("metadata", {})
standard_metadata: StandardLoggingMetadata = get_standard_logging_metadata(
metadata=_metadata
standard_metadata: StandardLoggingMetadata = (
StandardLoggingPayloadSetup.get_standard_logging_metadata(
metadata=_metadata
)
)
self.litellm_deployment_failed_fallbacks.labels(
requested_model=original_model_group,
@@ -951,8 +1074,8 @@ class PrometheusLogger(CustomLogger):
self,
state: int,
litellm_model_name: str,
model_id: str,
api_base: str,
model_id: Optional[str],
api_base: Optional[str],
api_provider: str,
):
self.litellm_deployment_state.labels(
@@ -973,8 +1096,8 @@ class PrometheusLogger(CustomLogger):
def set_deployment_partial_outage(
self,
litellm_model_name: str,
model_id: str,
api_base: str,
model_id: Optional[str],
api_base: Optional[str],
api_provider: str,
):
self.set_litellm_deployment_state(
@@ -984,8 +1107,8 @@ class PrometheusLogger(CustomLogger):
def set_deployment_complete_outage(
self,
litellm_model_name: str,
model_id: str,
api_base: str,
model_id: Optional[str],
api_base: Optional[str],
api_provider: str,
):
self.set_litellm_deployment_state(
@@ -1007,14 +1130,13 @@ class PrometheusLogger(CustomLogger):
litellm_model_name, model_id, api_base, api_provider, exception_status
).inc()
def _safe_get_remaining_budget(
self, max_budget: Optional[float], spend: Optional[float]
) -> float:
if max_budget is None:
return float("inf")
def safe_get_remaining_budget(
max_budget: Optional[float], spend: Optional[float]
) -> float:
if max_budget is None:
return float("inf")
if spend is None:
return max_budget
if spend is None:
return max_budget
return max_budget - spend
return max_budget - spend
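A small standalone sketch of the budget arithmetic moved into _safe_get_remaining_budget above; the semantics are unchanged: no configured max budget means infinite headroom, and no recorded spend means the full budget remains.

# Standalone reproduction of the remaining-budget logic (no Prometheus required).
from typing import Optional


def remaining_budget(max_budget: Optional[float], spend: Optional[float]) -> float:
    if max_budget is None:
        return float("inf")  # no cap configured
    if spend is None:
        return max_budget    # nothing spent yet
    return max_budget - spend


assert remaining_budget(None, 12.5) == float("inf")
assert remaining_budget(100.0, None) == 100.0
assert remaining_budget(100.0, 12.5) == 87.5
# PrometheusLogger feeds these values into the litellm_remaining_team_budget_metric
# and litellm_remaining_api_key_budget_metric gauges.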

View file

@@ -333,6 +333,14 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915
api_key: Optional[str],
dynamic_api_key: Optional[str],
) -> Tuple[str, str, Optional[str], Optional[str]]:
"""
Returns:
Tuple[str, str, Optional[str], Optional[str]]:
model: str
custom_llm_provider: str
dynamic_api_key: Optional[str]
api_base: Optional[str]
"""
custom_llm_provider = model.split("/", 1)[0]
model = model.split("/", 1)[1]
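The return contract documented above reduces to a single prefix split; a tiny sketch with a placeholder model string:

# Sketch of the provider-prefix split described in the new docstring.
model = "openrouter/anthropic/claude-3-haiku"  # placeholder
custom_llm_provider = model.split("/", 1)[0]   # "openrouter"
model = model.split("/", 1)[1]                 # "anthropic/claude-3-haiku"
print(custom_llm_provider, model)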

View file

@@ -12,7 +12,7 @@ import time
import traceback
import uuid
from datetime import datetime as dt_object
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
from pydantic import BaseModel
@@ -51,6 +51,7 @@ from litellm.types.utils import (
StandardPassThroughResponseObject,
TextCompletionResponse,
TranscriptionResponse,
Usage,
)
from litellm.utils import (
_get_base_model_from_metadata,
@@ -58,22 +59,21 @@ from litellm.utils import (
prompt_token_calculator,
)
from ..integrations.aispend import AISpendLogger
from ..integrations.argilla import ArgillaLogger
from ..integrations.arize_ai import ArizeLogger
from ..integrations.athina import AthinaLogger
from ..integrations.berrispend import BerriSpendLogger
from ..integrations.braintrust_logging import BraintrustLogger
from ..integrations.clickhouse import ClickhouseLogger
from ..integrations.datadog.datadog import DataDogLogger
from ..integrations.datadog.datadog_llm_obs import DataDogLLMObsLogger
from ..integrations.dynamodb import DyanmoDBLogger
from ..integrations.galileo import GalileoObserve
from ..integrations.gcs_bucket.gcs_bucket import GCSBucketLogger
from ..integrations.greenscale import GreenscaleLogger
from ..integrations.helicone import HeliconeLogger
from ..integrations.lago import LagoLogger
from ..integrations.langfuse import LangFuseLogger
from ..integrations.langfuse.langfuse import LangFuseLogger
from ..integrations.langfuse.langfuse_handler import LangFuseHandler
from ..integrations.langsmith import LangsmithLogger
from ..integrations.litedebugger import LiteDebugger
from ..integrations.literal_ai import LiteralAILogger
from ..integrations.logfire_logger import LogfireLevel, LogfireLogger
from ..integrations.lunary import LunaryLogger
@@ -122,13 +122,9 @@ prometheusLogger = None
dynamoLogger = None
s3Logger = None
genericAPILogger = None
clickHouseLogger = None
greenscaleLogger = None
lunaryLogger = None
aispendLogger = None
berrispendLogger = None
supabaseClient = None
liteDebuggerClient = None
callback_list: Optional[List[str]] = []
user_logger_fn = None
additional_details: Optional[Dict[str, str]] = {}
@@ -191,7 +187,7 @@ in_memory_dynamic_logger_cache = DynamicLoggingCache()
class Logging:
global supabaseClient, liteDebuggerClient, promptLayerLogger, weightsBiasesLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app
global supabaseClient, promptLayerLogger, weightsBiasesLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app
custom_pricing: bool = False
stream_options = None
@@ -970,22 +966,6 @@ class Logging:
):
print_verbose("no-log request, skipping logging")
continue
if callback == "lite_debugger" and liteDebuggerClient is not None:
print_verbose("reaches lite_debugger for logging!")
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
print_verbose(
f"liteDebuggerClient details function {self.call_type} and stream set to {self.stream}"
)
liteDebuggerClient.log_event(
end_user=kwargs.get("user", "default"),
response_obj=result,
start_time=start_time,
end_time=end_time,
litellm_call_id=self.litellm_call_id,
print_verbose=print_verbose,
call_type=self.call_type,
stream=self.stream,
)
if callback == "promptlayer" and promptLayerLogger is not None:
print_verbose("reaches promptlayer for logging!")
promptLayerLogger.log_event(
@@ -1136,74 +1116,13 @@ class Logging:
print_verbose("reaches langfuse for streaming logging!")
result = kwargs["complete_streaming_response"]
temp_langfuse_logger = langFuseLogger
if langFuseLogger is None or (
(
self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
!= langFuseLogger.public_key
)
or (
self.standard_callback_dynamic_params.get(
"langfuse_secret"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_secret"
)
!= langFuseLogger.secret_key
)
or (
self.standard_callback_dynamic_params.get(
"langfuse_host"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_host"
)
!= langFuseLogger.langfuse_host
)
):
credentials = {
"langfuse_public_key": self.standard_callback_dynamic_params.get(
"langfuse_public_key"
),
"langfuse_secret": self.standard_callback_dynamic_params.get(
"langfuse_secret"
),
"langfuse_host": self.standard_callback_dynamic_params.get(
"langfuse_host"
),
}
temp_langfuse_logger = (
in_memory_dynamic_logger_cache.get_cache(
credentials=credentials, service_name="langfuse"
)
)
if temp_langfuse_logger is None:
temp_langfuse_logger = LangFuseLogger(
langfuse_public_key=self.standard_callback_dynamic_params.get(
"langfuse_public_key"
),
langfuse_secret=self.standard_callback_dynamic_params.get(
"langfuse_secret"
),
langfuse_host=self.standard_callback_dynamic_params.get(
"langfuse_host"
),
)
in_memory_dynamic_logger_cache.set_cache(
credentials=credentials,
service_name="langfuse",
logging_obj=temp_langfuse_logger,
)
if temp_langfuse_logger is not None:
_response = temp_langfuse_logger.log_event(
langfuse_logger_to_use = LangFuseHandler.get_langfuse_logger_for_request(
globalLangfuseLogger=langFuseLogger,
standard_callback_dynamic_params=self.standard_callback_dynamic_params,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
if langfuse_logger_to_use is not None:
_response = langfuse_logger_to_use.log_event(
kwargs=kwargs,
response_obj=result,
start_time=start_time,
@@ -1248,37 +1167,6 @@ class Logging:
user_id=kwargs.get("user", None),
print_verbose=print_verbose,
)
if callback == "clickhouse":
global clickHouseLogger
verbose_logger.debug("reaches clickhouse for success logging!")
kwargs = {}
for k, v in self.model_call_details.items():
if (
k != "original_response"
): # copy.deepcopy raises errors as this could be a coroutine
kwargs[k] = v
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
if self.stream:
verbose_logger.debug(
f"is complete_streaming_response in kwargs: {kwargs.get('complete_streaming_response', None)}"
)
if complete_streaming_response is None:
continue
else:
print_verbose(
"reaches clickhouse for streaming logging!"
)
result = kwargs["complete_streaming_response"]
if clickHouseLogger is None:
clickHouseLogger = ClickhouseLogger()
clickHouseLogger.log_event(
kwargs=kwargs,
response_obj=result,
start_time=start_time,
end_time=end_time,
user_id=kwargs.get("user", None),
print_verbose=print_verbose,
)
if callback == "greenscale" and greenscaleLogger is not None:
kwargs = {}
for k, v in self.model_call_details.items():
@@ -1874,9 +1762,7 @@ class Logging:
)
for callback in callbacks:
try:
if callback == "lite_debugger" and liteDebuggerClient is not None:
pass
elif callback == "lunary" and lunaryLogger is not None:
if callback == "lunary" and lunaryLogger is not None:
print_verbose("reaches lunary for logging error!")
model = self.model
@@ -1962,50 +1848,12 @@ class Logging:
): # copy.deepcopy raises errors as this could be a coroutine
kwargs[k] = v
# this only logs streaming once, complete_streaming_response exists i.e when stream ends
if langFuseLogger is None or (
(
self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
!= langFuseLogger.public_key
)
or (
self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_public_key"
)
!= langFuseLogger.public_key
)
or (
self.standard_callback_dynamic_params.get(
"langfuse_host"
)
is not None
and self.standard_callback_dynamic_params.get(
"langfuse_host"
)
!= langFuseLogger.langfuse_host
)
):
langFuseLogger = LangFuseLogger(
langfuse_public_key=self.standard_callback_dynamic_params.get(
"langfuse_public_key"
),
langfuse_secret=self.standard_callback_dynamic_params.get(
"langfuse_secret"
),
langfuse_host=self.standard_callback_dynamic_params.get(
"langfuse_host"
),
)
_response = langFuseLogger.log_event(
langfuse_logger_to_use = LangFuseHandler.get_langfuse_logger_for_request(
globalLangfuseLogger=langFuseLogger,
standard_callback_dynamic_params=self.standard_callback_dynamic_params,
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
)
_response = langfuse_logger_to_use.log_event(
start_time=start_time,
end_time=end_time,
response_obj=None,
@@ -2195,7 +2043,7 @@ def set_callbacks(callback_list, function_id=None): # noqa: PLR0915
"""
Globally sets the callback client
"""
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, supabaseClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
try:
for callback in callback_list:
@@ -2275,26 +2123,12 @@ def set_callbacks(callback_list, function_id=None): # noqa: PLR0915
weightsBiasesLogger = WeightsBiasesLogger()
elif callback == "logfire":
logfireLogger = LogfireLogger()
elif callback == "aispend":
aispendLogger = AISpendLogger()
elif callback == "berrispend":
berrispendLogger = BerriSpendLogger()
elif callback == "supabase":
print_verbose("instantiating supabase")
supabaseClient = Supabase()
elif callback == "greenscale":
greenscaleLogger = GreenscaleLogger()
print_verbose("Initialized Greenscale Logger")
elif callback == "lite_debugger":
print_verbose("instantiating lite_debugger")
if function_id:
liteDebuggerClient = LiteDebugger(email=function_id)
elif litellm.token:
liteDebuggerClient = LiteDebugger(email=litellm.token)
elif litellm.email:
liteDebuggerClient = LiteDebugger(email=litellm.email)
else:
liteDebuggerClient = LiteDebugger(email=str(uuid.uuid4()))
elif callable(callback):
customLogger = CustomLogger()
except Exception as e:
@@ -2372,6 +2206,10 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
_datadog_logger = DataDogLogger()
_in_memory_loggers.append(_datadog_logger)
return _datadog_logger # type: ignore
elif logging_integration == "datadog_llm_observability":
_datadog_llm_obs_logger = DataDogLLMObsLogger()
_in_memory_loggers.append(_datadog_llm_obs_logger)
return _datadog_llm_obs_logger # type: ignore
elif logging_integration == "gcs_bucket":
for callback in _in_memory_loggers:
if isinstance(callback, GCSBucketLogger):
@@ -2389,22 +2227,16 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
_in_memory_loggers.append(_opik_logger)
return _opik_logger # type: ignore
elif logging_integration == "arize":
if "ARIZE_SPACE_KEY" not in os.environ:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
if "ARIZE_API_KEY" not in os.environ:
raise ValueError("ARIZE_API_KEY not found in environment variables")
from litellm.integrations.opentelemetry import (
OpenTelemetry,
OpenTelemetryConfig,
)
arize_endpoint = (
os.environ.get("ARIZE_ENDPOINT", None) or "https://otlp.arize.com/v1"
)
otel_config = OpenTelemetryConfig(
exporter="otlp_grpc",
endpoint=arize_endpoint,
)
otel_config = ArizeLogger.get_arize_opentelemetry_config()
if otel_config is None:
raise ValueError(
"No valid endpoint found for Arize, please set 'ARIZE_ENDPOINT' to your GRPC endpoint or 'ARIZE_HTTP_ENDPOINT' to your HTTP endpoint"
)
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"space_key={os.getenv('ARIZE_SPACE_KEY')},api_key={os.getenv('ARIZE_API_KEY')}"
)
@@ -2417,7 +2249,6 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915
_otel_logger = OpenTelemetry(config=otel_config, callback_name="arize")
_in_memory_loggers.append(_otel_logger)
return _otel_logger # type: ignore
elif logging_integration == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry
@@ -2546,6 +2377,10 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, DataDogLogger):
return callback
elif logging_integration == "datadog_llm_observability":
for callback in _in_memory_loggers:
if isinstance(callback, DataDogLLMObsLogger):
return callback
elif logging_integration == "gcs_bucket":
for callback in _in_memory_loggers:
if isinstance(callback, GCSBucketLogger):
@@ -2629,7 +2464,184 @@ def is_valid_sha256_hash(value: str) -> bool:
return bool(re.fullmatch(r"[a-fA-F0-9]{64}", value))
def get_standard_logging_object_payload( # noqa: PLR0915
class StandardLoggingPayloadSetup:
@staticmethod
def cleanup_timestamps(
start_time: Union[dt_object, float],
end_time: Union[dt_object, float],
completion_start_time: Union[dt_object, float],
) -> Tuple[float, float, float]:
"""
Convert datetime objects to floats
"""
if isinstance(start_time, datetime.datetime):
start_time_float = start_time.timestamp()
elif isinstance(start_time, float):
start_time_float = start_time
else:
raise ValueError(
f"start_time is required, got={start_time} of type {type(start_time)}"
)
if isinstance(end_time, datetime.datetime):
end_time_float = end_time.timestamp()
elif isinstance(end_time, float):
end_time_float = end_time
else:
raise ValueError(
f"end_time is required, got={end_time} of type {type(end_time)}"
)
if isinstance(completion_start_time, datetime.datetime):
completion_start_time_float = completion_start_time.timestamp()
elif isinstance(completion_start_time, float):
completion_start_time_float = completion_start_time
else:
completion_start_time_float = end_time_float
return start_time_float, end_time_float, completion_start_time_float
@staticmethod
def get_standard_logging_metadata(
metadata: Optional[Dict[str, Any]]
) -> StandardLoggingMetadata:
"""
Clean and filter the metadata dictionary to include only the specified keys in StandardLoggingMetadata.
Args:
metadata (Optional[Dict[str, Any]]): The original metadata dictionary.
Returns:
StandardLoggingMetadata: A StandardLoggingMetadata object containing the cleaned metadata.
Note:
- If the input metadata is None or not a dictionary, an empty StandardLoggingMetadata object is returned.
- If 'user_api_key' is present in metadata and is a valid SHA256 hash, it's stored as 'user_api_key_hash'.
"""
# Initialize with default values
clean_metadata = StandardLoggingMetadata(
user_api_key_hash=None,
user_api_key_alias=None,
user_api_key_team_id=None,
user_api_key_org_id=None,
user_api_key_user_id=None,
user_api_key_team_alias=None,
spend_logs_metadata=None,
requester_ip_address=None,
requester_metadata=None,
)
if isinstance(metadata, dict):
# Filter the metadata dictionary to include only the specified keys
clean_metadata = StandardLoggingMetadata(
**{ # type: ignore
key: metadata[key]
for key in StandardLoggingMetadata.__annotations__.keys()
if key in metadata
}
)
if metadata.get("user_api_key") is not None:
if is_valid_sha256_hash(str(metadata.get("user_api_key"))):
clean_metadata["user_api_key_hash"] = metadata.get(
"user_api_key"
) # this is the hash
return clean_metadata
@staticmethod
def get_usage_from_response_obj(response_obj: Optional[dict]) -> Usage:
## BASE CASE ##
if response_obj is None:
return Usage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
usage = response_obj.get("usage", None) or {}
if usage is None or (
not isinstance(usage, dict) and not isinstance(usage, Usage)
):
return Usage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
elif isinstance(usage, Usage):
return usage
elif isinstance(usage, dict):
return Usage(**usage)
raise ValueError(f"usage is required, got={usage} of type {type(usage)}")
@staticmethod
def get_model_cost_information(
base_model: Optional[str],
custom_pricing: Optional[bool],
custom_llm_provider: Optional[str],
init_response_obj: Union[Any, BaseModel, dict],
) -> StandardLoggingModelInformation:
model_cost_name = _select_model_name_for_cost_calc(
model=None,
completion_response=init_response_obj, # type: ignore
base_model=base_model,
custom_pricing=custom_pricing,
)
if model_cost_name is None:
model_cost_information = StandardLoggingModelInformation(
model_map_key="", model_map_value=None
)
else:
try:
_model_cost_information = litellm.get_model_info(
model=model_cost_name, custom_llm_provider=custom_llm_provider
)
model_cost_information = StandardLoggingModelInformation(
model_map_key=model_cost_name,
model_map_value=_model_cost_information,
)
except Exception:
verbose_logger.debug( # keep in debug otherwise it will trigger on every call
"Model={} is not mapped in model cost map. Defaulting to None model_cost_information for standard_logging_payload".format(
model_cost_name
)
)
model_cost_information = StandardLoggingModelInformation(
model_map_key=model_cost_name, model_map_value=None
)
return model_cost_information
@staticmethod
def get_final_response_obj(
response_obj: dict, init_response_obj: Union[Any, BaseModel, dict], kwargs: dict
) -> Optional[Union[dict, str, list]]:
"""
Get final response object after redacting the message input/output from logging
"""
if response_obj is not None:
final_response_obj: Optional[Union[dict, str, list]] = response_obj
elif isinstance(init_response_obj, list) or isinstance(init_response_obj, str):
final_response_obj = init_response_obj
else:
final_response_obj = None
modified_final_response_obj = redact_message_input_output_from_logging(
model_call_details=kwargs,
result=final_response_obj,
)
if modified_final_response_obj is not None and isinstance(
modified_final_response_obj, BaseModel
):
final_response_obj = modified_final_response_obj.model_dump()
else:
final_response_obj = modified_final_response_obj
return final_response_obj
def get_standard_logging_object_payload(
kwargs: Optional[dict],
init_response_obj: Union[Any, BaseModel, dict],
start_time: dt_object,
@@ -2677,9 +2689,9 @@ def get_standard_logging_object_payload( # noqa: PLR0915
completion_start_time = kwargs.get("completion_start_time", end_time)
call_type = kwargs.get("call_type")
cache_hit = kwargs.get("cache_hit", False)
usage = response_obj.get("usage", None) or {}
if type(usage) is litellm.Usage:
usage = dict(usage)
usage = StandardLoggingPayloadSetup.get_usage_from_response_obj(
response_obj=response_obj
)
id = response_obj.get("id", kwargs.get("litellm_call_id"))
_model_id = metadata.get("model_info", {}).get("id", "")
@@ -2692,20 +2704,13 @@ def get_standard_logging_object_payload( # noqa: PLR0915
)
# cleanup timestamps
if isinstance(start_time, datetime.datetime):
start_time_float = start_time.timestamp()
elif isinstance(start_time, float):
start_time_float = start_time
if isinstance(end_time, datetime.datetime):
end_time_float = end_time.timestamp()
elif isinstance(end_time, float):
end_time_float = end_time
if isinstance(completion_start_time, datetime.datetime):
completion_start_time_float = completion_start_time.timestamp()
elif isinstance(completion_start_time, float):
completion_start_time_float = completion_start_time
else:
completion_start_time_float = end_time_float
start_time_float, end_time_float, completion_start_time_float = (
StandardLoggingPayloadSetup.cleanup_timestamps(
start_time=start_time,
end_time=end_time,
completion_start_time=completion_start_time,
)
)
# clean up litellm hidden params
clean_hidden_params = StandardLoggingHiddenParams(
model_id=None,
@@ -2723,7 +2728,9 @@ def get_standard_logging_object_payload( # noqa: PLR0915
}
)
# clean up litellm metadata
clean_metadata = get_standard_logging_metadata(metadata=metadata)
clean_metadata = StandardLoggingPayloadSetup.get_standard_logging_metadata(
metadata=metadata
)
if litellm.cache is not None:
cache_key = litellm.cache.get_cache_key(**kwargs)
@@ -2745,58 +2752,21 @@ def get_standard_logging_object_payload( # noqa: PLR0915
## Get model cost information ##
base_model = _get_base_model_from_metadata(model_call_details=kwargs)
custom_pricing = use_custom_pricing_for_model(litellm_params=litellm_params)
model_cost_name = _select_model_name_for_cost_calc(
model=None,
completion_response=init_response_obj, # type: ignore
model_cost_information = StandardLoggingPayloadSetup.get_model_cost_information(
base_model=base_model,
custom_pricing=custom_pricing,
custom_llm_provider=kwargs.get("custom_llm_provider"),
init_response_obj=init_response_obj,
)
if model_cost_name is None:
model_cost_information = StandardLoggingModelInformation(
model_map_key="", model_map_value=None
)
else:
custom_llm_provider = kwargs.get("custom_llm_provider", None)
try:
_model_cost_information = litellm.get_model_info(
model=model_cost_name, custom_llm_provider=custom_llm_provider
)
model_cost_information = StandardLoggingModelInformation(
model_map_key=model_cost_name,
model_map_value=_model_cost_information,
)
except Exception:
verbose_logger.debug( # keep in debug otherwise it will trigger on every call
"Model={} is not mapped in model cost map. Defaulting to None model_cost_information for standard_logging_payload".format(
model_cost_name
)
)
model_cost_information = StandardLoggingModelInformation(
model_map_key=model_cost_name, model_map_value=None
)
response_cost: float = kwargs.get("response_cost", 0) or 0.0
if response_obj is not None:
final_response_obj: Optional[Union[dict, str, list]] = response_obj
elif isinstance(init_response_obj, list) or isinstance(init_response_obj, str):
final_response_obj = init_response_obj
else:
final_response_obj = None
modified_final_response_obj = redact_message_input_output_from_logging(
model_call_details=kwargs,
result=final_response_obj,
## get final response object ##
final_response_obj = StandardLoggingPayloadSetup.get_final_response_obj(
response_obj=response_obj,
init_response_obj=init_response_obj,
kwargs=kwargs,
)
if modified_final_response_obj is not None and isinstance(
modified_final_response_obj, BaseModel
):
final_response_obj = modified_final_response_obj.model_dump()
else:
final_response_obj = modified_final_response_obj
payload: StandardLoggingPayload = StandardLoggingPayload(
id=str(id),
call_type=call_type or "",
@@ -2810,9 +2780,9 @@ def get_standard_logging_object_payload( # noqa: PLR0915
metadata=clean_metadata,
cache_key=cache_key,
response_cost=response_cost,
total_tokens=usage.get("total_tokens", 0),
prompt_tokens=usage.get("prompt_tokens", 0),
completion_tokens=usage.get("completion_tokens", 0),
total_tokens=usage.total_tokens,
prompt_tokens=usage.prompt_tokens,
completion_tokens=usage.completion_tokens,
request_tags=request_tags,
end_user=end_user_id or "",
api_base=litellm_params.get("api_base", ""),
@@ -2859,6 +2829,7 @@ def get_standard_logging_metadata(
user_api_key_hash=None,
user_api_key_alias=None,
user_api_key_team_id=None,
user_api_key_org_id=None,
user_api_key_user_id=None,
user_api_key_team_alias=None,
spend_logs_metadata=None,
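A brief hedged sketch of two helpers introduced by the StandardLoggingPayloadSetup refactor above, assuming this file's import path (litellm/litellm_core_utils/litellm_logging.py): datetimes collapse to epoch floats, and missing usage normalizes to an all-zero Usage object.

# Sketch: exercising two StandardLoggingPayloadSetup helpers from this diff.
from datetime import datetime

from litellm.litellm_core_utils.litellm_logging import StandardLoggingPayloadSetup

start = datetime(2024, 10, 28, 12, 0, 0)
end = datetime(2024, 10, 28, 12, 0, 2)

start_f, end_f, completion_f = StandardLoggingPayloadSetup.cleanup_timestamps(
    start_time=start,
    end_time=end,
    completion_start_time=end,  # datetimes/floats are converted; anything else falls back to end_time
)
assert end_f - start_f == 2.0

# A missing or malformed usage block degrades to zeros instead of raising.
usage = StandardLoggingPayloadSetup.get_usage_from_response_obj(response_obj=None)
assert usage.total_tokens == 0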

View file

@@ -0,0 +1,508 @@
import asyncio
import json
import time
import traceback
import uuid
from typing import Dict, Iterable, List, Literal, Optional, Union
import litellm
from litellm._logging import verbose_logger
from litellm.types.utils import (
ChatCompletionDeltaToolCall,
ChatCompletionMessageToolCall,
Choices,
Delta,
EmbeddingResponse,
Function,
ImageResponse,
Message,
ModelResponse,
RerankResponse,
StreamingChoices,
TranscriptionResponse,
Usage,
)
from .get_headers import get_response_headers
async def convert_to_streaming_response_async(response_object: Optional[dict] = None):
"""
Asynchronously converts a response object to a streaming response.
Args:
response_object (Optional[dict]): The response object to be converted. Defaults to None.
Raises:
Exception: If the response object is None.
Yields:
ModelResponse: The converted streaming response object.
Returns:
None
"""
if response_object is None:
raise Exception("Error in response object format")
model_response_object = ModelResponse(stream=True)
if model_response_object is None:
raise Exception("Error in response creating model response object")
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
if (
choice["message"].get("tool_calls", None) is not None
and isinstance(choice["message"]["tool_calls"], list)
and len(choice["message"]["tool_calls"]) > 0
and isinstance(choice["message"]["tool_calls"][0], dict)
):
pydantic_tool_calls = []
for index, t in enumerate(choice["message"]["tool_calls"]):
if "index" not in t:
t["index"] = index
pydantic_tool_calls.append(ChatCompletionDeltaToolCall(**t))
choice["message"]["tool_calls"] = pydantic_tool_calls
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
finish_reason = choice.get("finish_details")
logprobs = choice.get("logprobs", None)
choice = StreamingChoices(
finish_reason=finish_reason, index=idx, delta=delta, logprobs=logprobs
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
setattr(
model_response_object,
"usage",
Usage(
completion_tokens=response_object["usage"].get("completion_tokens", 0),
prompt_tokens=response_object["usage"].get("prompt_tokens", 0),
total_tokens=response_object["usage"].get("total_tokens", 0),
),
)
if "id" in response_object:
model_response_object.id = response_object["id"]
if "created" in response_object:
model_response_object.created = response_object["created"]
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object["system_fingerprint"]
if "model" in response_object:
model_response_object.model = response_object["model"]
yield model_response_object
await asyncio.sleep(0)
def convert_to_streaming_response(response_object: Optional[dict] = None):
# used for yielding Cache hits when stream == True
if response_object is None:
raise Exception("Error in response object format")
model_response_object = ModelResponse(stream=True)
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
finish_reason = choice.get("finish_details")
logprobs = choice.get("logprobs", None)
enhancements = choice.get("enhancements", None)
choice = StreamingChoices(
finish_reason=finish_reason,
index=idx,
delta=delta,
logprobs=logprobs,
enhancements=enhancements,
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
setattr(model_response_object, "usage", Usage())
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if "id" in response_object:
model_response_object.id = response_object["id"]
if "created" in response_object:
model_response_object.created = response_object["created"]
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object["system_fingerprint"]
if "model" in response_object:
model_response_object.model = response_object["model"]
yield model_response_object
from collections import defaultdict
def _handle_invalid_parallel_tool_calls(
tool_calls: List[ChatCompletionMessageToolCall],
):
"""
Handle hallucinated parallel tool call from openai - https://community.openai.com/t/model-tries-to-call-unknown-function-multi-tool-use-parallel/490653
Code modified from: https://github.com/phdowling/openai_multi_tool_use_parallel_patch/blob/main/openai_multi_tool_use_parallel_patch.py
"""
if tool_calls is None:
return
try:
replacements: Dict[int, List[ChatCompletionMessageToolCall]] = defaultdict(list)
for i, tool_call in enumerate(tool_calls):
current_function = tool_call.function.name
function_args = json.loads(tool_call.function.arguments)
if current_function == "multi_tool_use.parallel":
verbose_logger.debug(
"OpenAI did a weird pseudo-multi-tool-use call, fixing call structure.."
)
for _fake_i, _fake_tool_use in enumerate(function_args["tool_uses"]):
_function_args = _fake_tool_use["parameters"]
_current_function = _fake_tool_use["recipient_name"]
if _current_function.startswith("functions."):
_current_function = _current_function[len("functions.") :]
fixed_tc = ChatCompletionMessageToolCall(
id=f"{tool_call.id}_{_fake_i}",
type="function",
function=Function(
name=_current_function, arguments=json.dumps(_function_args)
),
)
replacements[i].append(fixed_tc)
shift = 0
for i, replacement in replacements.items():
tool_calls[:] = (
tool_calls[: i + shift] + replacement + tool_calls[i + shift + 1 :]
)
shift += len(replacement)
return tool_calls
except json.JSONDecodeError:
# if there is a JSONDecodeError, return the original tool_calls
return tool_calls
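# Illustrative example (comment only, not executed): a hallucinated call like
#   ChatCompletionMessageToolCall(
#       id="call_0",
#       type="function",
#       function=Function(
#           name="multi_tool_use.parallel",
#           arguments='{"tool_uses": [{"recipient_name": "functions.get_weather",
#                                       "parameters": {"city": "Paris"}}]}',
#       ),
#   )
# is rewritten in place by the helper above into an ordinary per-function call:
#   ChatCompletionMessageToolCall(
#       id="call_0_0",
#       type="function",
#       function=Function(name="get_weather", arguments='{"city": "Paris"}'),
#   )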
class LiteLLMResponseObjectHandler:
@staticmethod
def convert_to_image_response(
response_object: dict,
model_response_object: Optional[ImageResponse] = None,
hidden_params: Optional[dict] = None,
) -> ImageResponse:
response_object.update({"hidden_params": hidden_params})
if model_response_object is None:
model_response_object = ImageResponse(**response_object)
return model_response_object
else:
model_response_dict = model_response_object.model_dump()
model_response_dict.update(response_object)
model_response_object = ImageResponse(**model_response_dict)
return model_response_object
def convert_to_model_response_object( # noqa: PLR0915
response_object: Optional[dict] = None,
model_response_object: Optional[
Union[
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
RerankResponse,
]
] = None,
response_type: Literal[
"completion", "embedding", "image_generation", "audio_transcription", "rerank"
] = "completion",
stream=False,
start_time=None,
end_time=None,
hidden_params: Optional[dict] = None,
_response_headers: Optional[dict] = None,
convert_tool_call_to_json_mode: Optional[
bool
] = None, # used for supporting 'json_schema' on older models
):
received_args = locals()
additional_headers = get_response_headers(_response_headers)
if hidden_params is None:
hidden_params = {}
hidden_params["additional_headers"] = additional_headers
### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
if (
response_object is not None
and "error" in response_object
and response_object["error"] is not None
):
error_args = {"status_code": 422, "message": "Error in response object"}
if isinstance(response_object["error"], dict):
if "code" in response_object["error"]:
error_args["status_code"] = response_object["error"]["code"]
if "message" in response_object["error"]:
if isinstance(response_object["error"]["message"], dict):
message_str = json.dumps(response_object["error"]["message"])
else:
message_str = str(response_object["error"]["message"])
error_args["message"] = message_str
raised_exception = Exception()
setattr(raised_exception, "status_code", error_args["status_code"])
setattr(raised_exception, "message", error_args["message"])
raise raised_exception
try:
if response_type == "completion" and (
model_response_object is None
or isinstance(model_response_object, ModelResponse)
):
if response_object is None or model_response_object is None:
raise Exception("Error in response object format")
if stream is True:
# for returning cached responses, we need to yield a generator
return convert_to_streaming_response(response_object=response_object)
choice_list = []
assert response_object["choices"] is not None and isinstance(
response_object["choices"], Iterable
)
for idx, choice in enumerate(response_object["choices"]):
## HANDLE JSON MODE - anthropic returns single function call
tool_calls = choice["message"].get("tool_calls", None)
if tool_calls is not None:
_openai_tool_calls = []
for _tc in tool_calls:
_openai_tc = ChatCompletionMessageToolCall(**_tc)
_openai_tool_calls.append(_openai_tc)
fixed_tool_calls = _handle_invalid_parallel_tool_calls(
_openai_tool_calls
)
if fixed_tool_calls is not None:
tool_calls = fixed_tool_calls
message: Optional[Message] = None
finish_reason: Optional[str] = None
if (
convert_tool_call_to_json_mode
and tool_calls is not None
and len(tool_calls) == 1
):
# to support 'json_schema' logic on older models
json_mode_content_str: Optional[str] = tool_calls[0][
"function"
].get("arguments")
if json_mode_content_str is not None:
message = litellm.Message(content=json_mode_content_str)
finish_reason = "stop"
if message is None:
message = Message(
content=choice["message"].get("content", None),
role=choice["message"]["role"] or "assistant",
function_call=choice["message"].get("function_call", None),
tool_calls=tool_calls,
audio=choice["message"].get("audio", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
finish_reason = choice.get("finish_details") or "stop"
logprobs = choice.get("logprobs", None)
enhancements = choice.get("enhancements", None)
choice = Choices(
finish_reason=finish_reason,
index=idx,
message=message,
logprobs=logprobs,
enhancements=enhancements,
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
usage_object = litellm.Usage(**response_object["usage"])
setattr(model_response_object, "usage", usage_object)
if "created" in response_object:
model_response_object.created = response_object["created"] or int(
time.time()
)
if "id" in response_object:
model_response_object.id = response_object["id"] or str(uuid.uuid4())
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object[
"system_fingerprint"
]
if "model" in response_object:
if model_response_object.model is None:
model_response_object.model = response_object["model"]
elif (
"/" in model_response_object.model
and response_object["model"] is not None
):
openai_compatible_provider = model_response_object.model.split("/")[
0
]
model_response_object.model = (
openai_compatible_provider + "/" + response_object["model"]
)
if start_time is not None and end_time is not None:
if isinstance(start_time, type(end_time)):
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000
if hidden_params is not None:
if model_response_object._hidden_params is None:
model_response_object._hidden_params = {}
model_response_object._hidden_params.update(hidden_params)
if _response_headers is not None:
model_response_object._response_headers = _response_headers
special_keys = list(litellm.ModelResponse.model_fields.keys())
special_keys.append("usage")
for k, v in response_object.items():
if k not in special_keys:
setattr(model_response_object, k, v)
return model_response_object
elif response_type == "embedding" and (
model_response_object is None
or isinstance(model_response_object, EmbeddingResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = EmbeddingResponse()
if "model" in response_object:
model_response_object.model = response_object["model"]
if "object" in response_object:
model_response_object.object = response_object["object"]
model_response_object.data = response_object["data"]
if "usage" in response_object and response_object["usage"] is not None:
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if start_time is not None and end_time is not None:
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
if _response_headers is not None:
model_response_object._response_headers = _response_headers
return model_response_object
elif response_type == "image_generation" and (
model_response_object is None
or isinstance(model_response_object, ImageResponse)
):
if response_object is None:
raise Exception("Error in response object format")
return LiteLLMResponseObjectHandler.convert_to_image_response(
response_object=response_object,
model_response_object=model_response_object,
hidden_params=hidden_params,
)
elif response_type == "audio_transcription" and (
model_response_object is None
or isinstance(model_response_object, TranscriptionResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = TranscriptionResponse()
if "text" in response_object:
model_response_object.text = response_object["text"]
optional_keys = ["language", "task", "duration", "words", "segments"]
for key in optional_keys: # not guaranteed to be in response
if key in response_object:
setattr(model_response_object, key, response_object[key])
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
if _response_headers is not None:
model_response_object._response_headers = _response_headers
return model_response_object
elif response_type == "rerank" and (
model_response_object is None
or isinstance(model_response_object, RerankResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = RerankResponse(**response_object)
return model_response_object
if "id" in response_object:
model_response_object.id = response_object["id"]
if "meta" in response_object:
model_response_object.meta = response_object["meta"]
if "results" in response_object:
model_response_object.results = response_object["results"]
return model_response_object
except Exception:
raise Exception(
f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
)
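
A minimal, self-contained sketch of the error check at the top of convert_to_model_response_object, re-implemented in plain Python so it can run on its own. The sample payload is an assumption based on the "openrouter returns these in the dictionary" comment above, not a captured response.

# Sketch: mirrors the error-extraction logic above (status_code/message attributes
# attached to a bare Exception). Sample payload is illustrative only.
import json


def raise_if_error(response_object: dict) -> None:
    error = response_object.get("error")
    if error is None:
        return
    status_code = 422
    message = "Error in response object"
    if isinstance(error, dict):
        if "code" in error:
            status_code = error["code"]
        if "message" in error:
            msg = error["message"]
            message = json.dumps(msg) if isinstance(msg, dict) else str(msg)
    exc = Exception()
    setattr(exc, "status_code", status_code)
    setattr(exc, "message", message)
    raise exc


try:
    raise_if_error({"error": {"code": 429, "message": "rate limited"}})
except Exception as e:
    print(getattr(e, "status_code", None), getattr(e, "message", None))  # 429 rate limited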

View file

@ -15,27 +15,28 @@ def get_response_headers(_response_headers: Optional[dict] = None) -> dict:
dict: _response_headers with OpenAI headers and llm_provider-{header}
"""
if _response_headers is not None:
openai_headers = {}
if "x-ratelimit-limit-requests" in _response_headers:
openai_headers["x-ratelimit-limit-requests"] = _response_headers[
"x-ratelimit-limit-requests"
]
if "x-ratelimit-remaining-requests" in _response_headers:
openai_headers["x-ratelimit-remaining-requests"] = _response_headers[
"x-ratelimit-remaining-requests"
]
if "x-ratelimit-limit-tokens" in _response_headers:
openai_headers["x-ratelimit-limit-tokens"] = _response_headers[
"x-ratelimit-limit-tokens"
]
if "x-ratelimit-remaining-tokens" in _response_headers:
openai_headers["x-ratelimit-remaining-tokens"] = _response_headers[
"x-ratelimit-remaining-tokens"
]
llm_provider_headers = _get_llm_provider_headers(_response_headers)
return {**llm_provider_headers, **openai_headers}
return {}
if _response_headers is None:
return {}
openai_headers = {}
if "x-ratelimit-limit-requests" in _response_headers:
openai_headers["x-ratelimit-limit-requests"] = _response_headers[
"x-ratelimit-limit-requests"
]
if "x-ratelimit-remaining-requests" in _response_headers:
openai_headers["x-ratelimit-remaining-requests"] = _response_headers[
"x-ratelimit-remaining-requests"
]
if "x-ratelimit-limit-tokens" in _response_headers:
openai_headers["x-ratelimit-limit-tokens"] = _response_headers[
"x-ratelimit-limit-tokens"
]
if "x-ratelimit-remaining-tokens" in _response_headers:
openai_headers["x-ratelimit-remaining-tokens"] = _response_headers[
"x-ratelimit-remaining-tokens"
]
llm_provider_headers = _get_llm_provider_headers(_response_headers)
return {**llm_provider_headers, **openai_headers}
def _get_llm_provider_headers(response_headers: dict) -> dict:
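
The refactor above replaces the nested if-block with an early return; the merged dict keeps the four OpenAI rate-limit headers as-is and layers them over provider-prefixed copies. Below is a standalone sketch of that merge; the "llm_provider-" prefixing is an assumption based on the docstring ("llm_provider-{header}"), since _get_llm_provider_headers itself is not shown in this diff.

# Standalone sketch of the header merge done by get_response_headers.
# The prefixing helper is a stand-in for _get_llm_provider_headers (assumed behavior).
from typing import Optional

OPENAI_RATE_LIMIT_HEADERS = [
    "x-ratelimit-limit-requests",
    "x-ratelimit-remaining-requests",
    "x-ratelimit-limit-tokens",
    "x-ratelimit-remaining-tokens",
]


def merge_response_headers(_response_headers: Optional[dict] = None) -> dict:
    if _response_headers is None:
        return {}
    openai_headers = {
        k: _response_headers[k] for k in OPENAI_RATE_LIMIT_HEADERS if k in _response_headers
    }
    llm_provider_headers = {f"llm_provider-{k}": v for k, v in _response_headers.items()}
    return {**llm_provider_headers, **openai_headers}


print(merge_response_headers({"x-ratelimit-remaining-requests": "99", "retry-after": "1"}))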

View file

@ -26,15 +26,24 @@ async with websockets.connect( # type: ignore
import asyncio
import concurrent.futures
import json
import traceback
from asyncio import Task
from typing import Any, Dict, List, Optional, Union
import litellm
from .litellm_logging import Logging as LiteLLMLogging
# Create a thread pool with a maximum of 10 threads
executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
DefaultLoggedRealTimeEventTypes = [
"session.created",
"response.create",
"response.done",
]
class RealTimeStreaming:
def __init__(
@ -49,9 +58,27 @@ class RealTimeStreaming:
self.messages: List = []
self.input_message: Dict = {}
_logged_real_time_event_types = litellm.logged_real_time_event_types
if _logged_real_time_event_types is None:
_logged_real_time_event_types = DefaultLoggedRealTimeEventTypes
self.logged_real_time_event_types = _logged_real_time_event_types
def _should_store_message(self, message: Union[str, bytes]) -> bool:
if isinstance(message, bytes):
message = message.decode("utf-8")
message_obj = json.loads(message)
_msg_type = message_obj["type"]
if self.logged_real_time_event_types == "*":
return True
if _msg_type in self.logged_real_time_event_types:
return True
return False
def store_message(self, message: Union[str, bytes]):
"""Store message in list"""
self.messages.append(message)
if self._should_store_message(message):
self.messages.append(message)
def store_input(self, message: dict):
"""Store input message"""

View file

@ -198,9 +198,6 @@ class AzureOpenAIConfig:
optional_params["json_mode"] = True
else:
optional_params["response_format"] = value
elif param == "max_completion_tokens":
# TODO - Azure OpenAI will probably add support for this, we should pass it through when Azure adds support
optional_params["max_tokens"] = value
elif param in supported_openai_params:
optional_params[param] = value

View file

@ -72,5 +72,5 @@ class AzureOpenAIRealtime(AzureChatCompletion):
except websockets.exceptions.InvalidStatusCode as e: # type: ignore
await websocket.close(code=e.status_code, reason=str(e))
except Exception as e:
await websocket.close(code=1011, reason=f"Internal server error: {str(e)}")
except Exception:
pass

View file

@ -1349,7 +1349,7 @@ class OpenAIChatCompletion(BaseLLM):
if aimg_generation is True:
return self.aimage_generation(data=data, prompt=prompt, logging_obj=logging_obj, model_response=model_response, api_base=api_base, api_key=api_key, timeout=timeout, client=client, max_retries=max_retries) # type: ignore
openai_client = self._get_openai_client(
openai_client: OpenAI = self._get_openai_client( # type: ignore
is_async=False,
api_key=api_key,
api_base=api_base,
@ -1371,8 +1371,9 @@ class OpenAIChatCompletion(BaseLLM):
)
## COMPLETION CALL
response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
response = response.model_dump() # type: ignore
_response = openai_client.images.generate(**data, timeout=timeout) # type: ignore
response = _response.model_dump()
## LOGGING
logging_obj.post_call(
input=prompt,
@ -1380,7 +1381,6 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
original_response=response,
)
# return response
return convert_to_model_response_object(response_object=response, model_response_object=model_response, response_type="image_generation") # type: ignore
except OpenAIError as e:

View file

@ -398,6 +398,8 @@ class AnthropicChatCompletion(BaseLLM):
error_response = getattr(e, "response", None)
if error_headers is None and error_response:
error_headers = getattr(error_response, "headers", None)
if error_response and hasattr(error_response, "text"):
error_text = getattr(error_response, "text", error_text)
raise AnthropicError(
message=error_text,
status_code=status_code,

View file

@ -9,7 +9,7 @@ import httpx
from openai import OpenAI
import litellm
from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.cohere.embed.handler import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,

View file

@ -19,6 +19,7 @@ from ..common_utils import BedrockError
from .invoke_handler import AWSEventStreamDecoder, MockResponseIterator, make_call
BEDROCK_CONVERSE_MODELS = [
"anthropic.claude-3-5-sonnet-20241022-v2:0",
"anthropic.claude-3-5-sonnet-20240620-v1:0",
"anthropic.claude-3-opus-20240229-v1:0",
"anthropic.claude-3-sonnet-20240229-v1:0",

View file

@ -7,6 +7,7 @@ Why separate file? Make it easy to see how transformation works
from typing import List
import litellm
from litellm.llms.cohere.embed.transformation import CohereEmbeddingConfig
from litellm.types.llms.bedrock import CohereEmbeddingRequest, CohereEmbeddingResponse
from litellm.types.utils import Embedding, EmbeddingResponse
@ -26,15 +27,21 @@ class BedrockCohereEmbeddingConfig:
optional_params["embedding_types"] = v
return optional_params
def _is_v3_model(self, model: str) -> bool:
return "3" in model
def _transform_request(
self, input: List[str], inference_params: dict
self, model: str, input: List[str], inference_params: dict
) -> CohereEmbeddingRequest:
transformed_request = CohereEmbeddingRequest(
texts=input,
input_type=litellm.COHERE_DEFAULT_EMBEDDING_INPUT_TYPE, # type: ignore
transformed_request = CohereEmbeddingConfig()._transform_request(
model, input, inference_params
)
for k, v in inference_params.items():
transformed_request[k] = v # type: ignore
new_transformed_request = CohereEmbeddingRequest(
input_type=transformed_request["input_type"],
)
for k in CohereEmbeddingRequest.__annotations__.keys():
if k in transformed_request:
new_transformed_request[k] = transformed_request[k] # type: ignore
return transformed_request
return new_transformed_request
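
The new request builder above delegates to CohereEmbeddingConfig and then copies only the keys declared on CohereEmbeddingRequest, so provider-specific fields (like "model") never leak into the Bedrock payload. The sketch below shows that key-filtering pattern with a hypothetical TypedDict stand-in rather than the real request type.

# Sketch of filtering a dict down to a TypedDict's declared keys.
# ExampleRequest is a hypothetical stand-in for CohereEmbeddingRequest.
from typing import List, TypedDict


class ExampleRequest(TypedDict, total=False):
    texts: List[str]
    input_type: str
    embedding_types: List[str]


def filter_to_request_keys(raw: dict) -> ExampleRequest:
    filtered = ExampleRequest()
    for key in ExampleRequest.__annotations__.keys():
        if key in raw:
            filtered[key] = raw[key]  # type: ignore
    return filtered


raw = {"texts": ["hello"], "input_type": "search_document", "model": "embed-english-v3.0"}
print(filter_to_request_keys(raw))  # "model" is dropped; only declared keys survive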

View file

@ -11,7 +11,7 @@ from typing import Any, Callable, List, Literal, Optional, Tuple, Union
import httpx
import litellm
from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.cohere.embed.handler import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
@ -369,7 +369,7 @@ class BedrockEmbedding(BaseAWSLLM):
batch_data: Optional[List] = None
if provider == "cohere":
data = BedrockCohereEmbeddingConfig()._transform_request(
input=input, inference_params=inference_params
model=model, input=input, inference_params=inference_params
)
elif provider == "amazon" and model in [
"amazon.titan-embed-image-v1",

View file

@ -12,8 +12,11 @@ import requests # type: ignore
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.bedrock import CohereEmbeddingRequest
from litellm.utils import Choices, Message, ModelResponse, Usage
from .transformation import CohereEmbeddingConfig
def validate_environment(api_key, headers: dict):
headers.update(
@ -41,39 +44,9 @@ class CohereError(Exception):
) # Call the base class constructor with the parameters it needs
def _process_embedding_response(
embeddings: list,
model_response: litellm.EmbeddingResponse,
model: str,
encoding: Any,
input: list,
) -> litellm.EmbeddingResponse:
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
)
return model_response
async def async_embedding(
model: str,
data: dict,
data: Union[dict, CohereEmbeddingRequest],
input: list,
model_response: litellm.utils.EmbeddingResponse,
timeout: Optional[Union[float, httpx.Timeout]],
@ -121,19 +94,12 @@ async def async_embedding(
)
raise e
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response.text,
)
embeddings = response.json()["embeddings"]
## PROCESS RESPONSE ##
return _process_embedding_response(
embeddings=embeddings,
return CohereEmbeddingConfig()._transform_response(
response=response,
api_key=api_key,
logging_obj=logging_obj,
data=data,
model_response=model_response,
model=model,
encoding=encoding,
@ -149,7 +115,7 @@ def embedding(
optional_params: dict,
headers: dict,
encoding: Any,
data: Optional[dict] = None,
data: Optional[Union[dict, CohereEmbeddingRequest]] = None,
complete_api_base: Optional[str] = None,
api_key: Optional[str] = None,
aembedding: Optional[bool] = None,
@ -159,11 +125,10 @@ def embedding(
headers = validate_environment(api_key, headers=headers)
embed_url = complete_api_base or "https://api.cohere.ai/v1/embed"
model = model
data = data or {"model": model, "texts": input, **optional_params}
if "3" in model and "input_type" not in data:
# cohere v3 embedding models require input_type, if no input_type is provided, default to "search_document"
data["input_type"] = "search_document"
data = data or CohereEmbeddingConfig()._transform_request(
model=model, input=input, inference_params=optional_params
)
## ROUTING
if aembedding is True:
@ -193,30 +158,12 @@ def embedding(
client = HTTPHandler(concurrent_limit=1)
response = client.post(embed_url, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
"""
response
{
'object': "list",
'data': [
]
'model',
'usage'
}
"""
if response.status_code != 200:
raise CohereError(message=response.text, status_code=response.status_code)
embeddings = response.json()["embeddings"]
return _process_embedding_response(
embeddings=embeddings,
return CohereEmbeddingConfig()._transform_response(
response=response,
api_key=api_key,
logging_obj=logging_obj,
data=data,
model_response=model_response,
model=model,
encoding=encoding,

View file

@ -0,0 +1,160 @@
"""
Transformation logic from OpenAI /v1/embeddings format to Cohere's /v1/embed format.
Why separate file? Make it easy to see how transformation works
Covers:
- v3 embedding models
- v2 embedding models
Docs - https://docs.cohere.com/v2/reference/embed
"""
import types
from typing import Any, List, Optional, Union
import httpx
from litellm import COHERE_DEFAULT_EMBEDDING_INPUT_TYPE
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.llms.bedrock import (
COHERE_EMBEDDING_INPUT_TYPES,
CohereEmbeddingRequest,
CohereEmbeddingRequestWithModel,
)
from litellm.types.utils import (
Embedding,
EmbeddingResponse,
PromptTokensDetailsWrapper,
Usage,
)
from litellm.utils import is_base64_encoded
class CohereEmbeddingConfig:
"""
Reference: https://docs.cohere.com/v2/reference/embed
"""
def __init__(self) -> None:
pass
def get_supported_openai_params(self) -> List[str]:
return ["encoding_format"]
def map_openai_params(
self, non_default_params: dict, optional_params: dict
) -> dict:
for k, v in non_default_params.items():
if k == "encoding_format":
optional_params["embedding_types"] = v
return optional_params
def _is_v3_model(self, model: str) -> bool:
return "3" in model
def _transform_request(
self, model: str, input: List[str], inference_params: dict
) -> CohereEmbeddingRequestWithModel:
is_encoded = False
for input_str in input:
is_encoded = is_base64_encoded(input_str)
if is_encoded: # check if string is b64 encoded image or not
transformed_request = CohereEmbeddingRequestWithModel(
model=model,
images=input,
input_type="image",
)
else:
transformed_request = CohereEmbeddingRequestWithModel(
model=model,
texts=input,
input_type=COHERE_DEFAULT_EMBEDDING_INPUT_TYPE,
)
for k, v in inference_params.items():
transformed_request[k] = v # type: ignore
return transformed_request
def _calculate_usage(self, input: List[str], encoding: Any, meta: dict) -> Usage:
input_tokens = 0
text_tokens: Optional[int] = meta.get("billed_units", {}).get("input_tokens")
image_tokens: Optional[int] = meta.get("billed_units", {}).get("images")
prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None
if image_tokens is None and text_tokens is None:
for text in input:
input_tokens += len(encoding.encode(text))
else:
prompt_tokens_details = PromptTokensDetailsWrapper(
image_tokens=image_tokens,
text_tokens=text_tokens,
)
if image_tokens:
input_tokens += image_tokens
if text_tokens:
input_tokens += text_tokens
return Usage(
prompt_tokens=input_tokens,
completion_tokens=0,
total_tokens=input_tokens,
prompt_tokens_details=prompt_tokens_details,
)
def _transform_response(
self,
response: httpx.Response,
api_key: Optional[str],
logging_obj: LiteLLMLoggingObj,
data: Union[dict, CohereEmbeddingRequest],
model_response: EmbeddingResponse,
model: str,
encoding: Any,
input: list,
) -> EmbeddingResponse:
response_json = response.json()
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response_json,
)
"""
response
{
'object': "list",
'data': [
]
'model',
'usage'
}
"""
embeddings = response_json["embeddings"]
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
setattr(
model_response,
"usage",
self._calculate_usage(input, encoding, response_json.get("meta", {})),
)
return model_response
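
A quick way to exercise the new transformation in isolation is to build a request with it directly; the import path comes from the Bedrock Cohere config diff above, so this only runs against a litellm install that already contains this file. The commented output is an expectation based on the code above, not captured output.

# Sketch: build a Cohere /v1/embed request body via the new config class.
from litellm.llms.cohere.embed.transformation import CohereEmbeddingConfig

config = CohereEmbeddingConfig()
request = config._transform_request(
    model="embed-english-v3.0",
    input=["hello world"],
    inference_params={"embedding_types": ["float"]},
)
print(request)
# expected shape: {'model': 'embed-english-v3.0', 'texts': ['hello world'],
#                  'input_type': 'search_document', 'embedding_types': ['float']}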

View file

@ -1,7 +1,7 @@
import asyncio
import os
import traceback
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, List, Mapping, Optional, Union
import httpx
from httpx import USE_CLIENT_DEFAULT
@ -32,15 +32,20 @@ class AsyncHTTPHandler:
def __init__(
self,
timeout: Optional[Union[float, httpx.Timeout]] = None,
event_hooks: Optional[Mapping[str, List[Callable[..., Any]]]] = None,
concurrent_limit=1000,
):
self.timeout = timeout
self.event_hooks = event_hooks
self.client = self.create_client(
timeout=timeout, concurrent_limit=concurrent_limit
timeout=timeout, concurrent_limit=concurrent_limit, event_hooks=event_hooks
)
def create_client(
self, timeout: Optional[Union[float, httpx.Timeout]], concurrent_limit: int
self,
timeout: Optional[Union[float, httpx.Timeout]],
concurrent_limit: int,
event_hooks: Optional[Mapping[str, List[Callable[..., Any]]]],
) -> httpx.AsyncClient:
# SSL certificates (a.k.a CA bundle) used to verify the identity of requested hosts.
@ -55,6 +60,7 @@ class AsyncHTTPHandler:
# Create a client with a connection pool
return httpx.AsyncClient(
event_hooks=event_hooks,
timeout=timeout,
limits=httpx.Limits(
max_connections=concurrent_limit,
@ -114,7 +120,9 @@ class AsyncHTTPHandler:
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
new_client = self.create_client(
timeout=timeout, concurrent_limit=1, event_hooks=self.event_hooks
)
try:
return await self.single_connection_post_request(
url=url,
@ -144,8 +152,10 @@ class AsyncHTTPHandler:
setattr(e, "status_code", e.response.status_code)
if stream is True:
setattr(e, "message", await e.response.aread())
setattr(e, "text", await e.response.aread())
else:
setattr(e, "message", e.response.text)
setattr(e, "text", e.response.text)
raise e
except Exception as e:
raise e
@ -172,7 +182,9 @@ class AsyncHTTPHandler:
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
new_client = self.create_client(
timeout=timeout, concurrent_limit=1, event_hooks=self.event_hooks
)
try:
return await self.single_connection_post_request(
url=url,
@ -229,7 +241,9 @@ class AsyncHTTPHandler:
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
new_client = self.create_client(
timeout=timeout, concurrent_limit=1, event_hooks=self.event_hooks
)
try:
return await self.single_connection_post_request(
url=url,
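
The event_hooks mapping threaded through AsyncHTTPHandler above is httpx's own hook mechanism: a dict of async callables keyed by "request" and "response", passed straight to httpx.AsyncClient (including the retry client created after connection errors). A minimal sketch using httpx directly:

# Sketch: request/response logging hooks, the same mapping the handler above forwards.
import asyncio

import httpx


async def log_request(request: httpx.Request) -> None:
    print(f"-> {request.method} {request.url}")


async def log_response(response: httpx.Response) -> None:
    print(f"<- {response.status_code} from {response.request.url}")


async def main() -> None:
    hooks = {"request": [log_request], "response": [log_response]}
    async with httpx.AsyncClient(event_hooks=hooks) as client:
        await client.get("https://example.com")


asyncio.run(main())

The same dict can be handed to AsyncHTTPHandler(event_hooks=...) as added above; the handler simply forwards it to every client it creates.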

View file

@ -398,6 +398,7 @@ def ollama_completion_stream(url, data, logging_obj):
isinstance(content_chunk, StreamingChoices)
and hasattr(content_chunk, "delta")
and hasattr(content_chunk.delta, "content")
and content_chunk.delta.content is not None
):
content_chunks.append(content_chunk.delta.content)
response_content = "".join(content_chunks)

View file

@ -2429,6 +2429,15 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
contents: List[BedrockMessageBlock] = []
msg_i = 0
## BASE CASE ##
if len(messages) == 0:
raise litellm.BadRequestError(
message=BAD_MESSAGE_ERROR_STR
+ "bedrock requires at least one non-system message",
model=model,
llm_provider=llm_provider,
)
# if initial message is assistant message
if messages[0].get("role") is not None and messages[0]["role"] == "assistant":
if user_continue_message is not None:

View file

@ -177,3 +177,16 @@ class VertexAIAnthropicConfig:
optional_params["json_mode"] = True
return optional_params
@classmethod
def is_supported_model(
cls, model: str, custom_llm_provider: Optional[str] = None
) -> bool:
"""
Check if the model is supported by the VertexAI Anthropic API.
"""
if custom_llm_provider == "vertex_ai" and "claude" in model.lower():
return True
elif model in litellm.vertex_anthropic_models:
return True
return False

View file

@ -113,7 +113,7 @@ from .llms.bedrock.chat import BedrockConverseLLM, BedrockLLM
from .llms.bedrock.embed.embedding import BedrockEmbedding
from .llms.cohere import chat as cohere_chat
from .llms.cohere import completion as cohere_completion # type: ignore
from .llms.cohere import embed as cohere_embed
from .llms.cohere.embed import handler as cohere_embed
from .llms.custom_llm import CustomLLM, custom_chat_llm_router
from .llms.databricks.chat import DatabricksChatCompletion
from .llms.groq.chat.handler import GroqChatCompletion
@ -4986,7 +4986,6 @@ def speech(
litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
metadata = kwargs.get("metadata", {})
model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore
kwargs.pop("tags", [])

View file

@ -1104,7 +1104,7 @@
"litellm_provider": "azure_ai",
"mode": "chat"
},
"azure_ai/Meta-Llama-31-8B-Instruct": {
"azure_ai/Meta-Llama-3.1-8B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1114,7 +1114,7 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-70B-Instruct": {
"azure_ai/Meta-Llama-3.1-70B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1124,7 +1124,7 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-405B-Instruct": {
"azure_ai/Meta-Llama-3.1-405B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1751,6 +1751,22 @@
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"claude-3-5-sonnet-20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159,
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"text-bison": {
"max_tokens": 2048,
"max_input_tokens": 8192,
@ -2578,6 +2594,18 @@
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-5-sonnet-v2@20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -3336,54 +3364,56 @@
"litellm_provider": "cohere",
"mode": "rerank"
},
"embed-english-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-light-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-multilingual-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-v2.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 4096,
"max_input_tokens": 4096,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-light-v2.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-multilingual-v2.0": {
"max_tokens": 256,
"max_input_tokens": 256,
"max_tokens": 768,
"max_input_tokens": 768,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-v3.0": {
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"input_cost_per_image": 0.0001,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding",
"supports_image_input": true
},
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
@ -3572,6 +3602,22 @@
"supports_vision": true,
"tool_use_system_prompt_tokens": 264
},
"anthropic/claude-3-5-sonnet-20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159,
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"openrouter/anthropic/claude-3.5-sonnet": {
"max_tokens": 8192,
"max_input_tokens": 200000,
@ -4093,6 +4139,18 @@
"litellm_provider": "bedrock",
"mode": "embedding"
},
"amazon.titan-embed-image-v1": {
"max_tokens": 128,
"max_input_tokens": 128,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000008,
"input_cost_per_image": 0.00006,
"output_cost_per_token": 0.0,
"litellm_provider": "bedrock",
"supports_image_input": true,
"mode": "embedding",
"source": "https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=amazon.titan-image-generator-v1"
},
"mistral.mistral-7b-instruct-v0:2": {
"max_tokens": 8191,
"max_input_tokens": 32000,
@ -4246,6 +4304,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -4290,6 +4359,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -4334,6 +4414,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -6369,6 +6460,14 @@
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-finance-2": {
"max_tokens": 4000,
"max_input_tokens": 4000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"databricks/databricks-meta-llama-3-1-405b-instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-e8ad0a25b0c46e0b.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f593049e31b05aeb.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-8316d07d1f41e39f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-e8ad0a25b0c46e0b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/00256a1984d35914.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[68031,[\"665\",\"static/chunks/3014691f-b24e8254c7593934.js\",\"936\",\"static/chunks/2f6dbc85-cac2949a76539886.js\",\"902\",\"static/chunks/902-58bf23027703b2e8.js\",\"131\",\"static/chunks/131-3d2257b0ff5aadb2.js\",\"684\",\"static/chunks/684-16b194c83a169f6d.js\",\"626\",\"static/chunks/626-fc3969bfc35ead00.js\",\"777\",\"static/chunks/777-a81b45dec53652df.js\",\"931\",\"static/chunks/app/page-7c218fb97a2a9817.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/00256a1984d35914.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"Q5YcBgN0qLD3pcZcx1fRm\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_86ef86\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-e8ad0a25b0c46e0b.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f593049e31b05aeb.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-8316d07d1f41e39f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-e8ad0a25b0c46e0b.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/a34f9d1faa5f3315-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/00256a1984d35914.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[68031,[\"665\",\"static/chunks/3014691f-b24e8254c7593934.js\",\"936\",\"static/chunks/2f6dbc85-cac2949a76539886.js\",\"902\",\"static/chunks/902-58bf23027703b2e8.js\",\"131\",\"static/chunks/131-3d2257b0ff5aadb2.js\",\"684\",\"static/chunks/684-16b194c83a169f6d.js\",\"626\",\"static/chunks/626-fc3969bfc35ead00.js\",\"777\",\"static/chunks/777-a81b45dec53652df.js\",\"931\",\"static/chunks/app/page-7b75dc53f1c6e449.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/00256a1984d35914.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"ffXp7j1jzMKpweBFKW_w2\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_86ef86\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[68031,["665","static/chunks/3014691f-b24e8254c7593934.js","936","static/chunks/2f6dbc85-cac2949a76539886.js","902","static/chunks/902-58bf23027703b2e8.js","131","static/chunks/131-3d2257b0ff5aadb2.js","684","static/chunks/684-16b194c83a169f6d.js","626","static/chunks/626-fc3969bfc35ead00.js","777","static/chunks/777-a81b45dec53652df.js","931","static/chunks/app/page-7c218fb97a2a9817.js"],""]
3:I[68031,["665","static/chunks/3014691f-b24e8254c7593934.js","936","static/chunks/2f6dbc85-cac2949a76539886.js","902","static/chunks/902-58bf23027703b2e8.js","131","static/chunks/131-3d2257b0ff5aadb2.js","684","static/chunks/684-16b194c83a169f6d.js","626","static/chunks/626-fc3969bfc35ead00.js","777","static/chunks/777-a81b45dec53652df.js","931","static/chunks/app/page-7b75dc53f1c6e449.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["Q5YcBgN0qLD3pcZcx1fRm",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["ffXp7j1jzMKpweBFKW_w2",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@ -2,6 +2,6 @@
3:I[87494,["902","static/chunks/902-58bf23027703b2e8.js","131","static/chunks/131-3d2257b0ff5aadb2.js","777","static/chunks/777-a81b45dec53652df.js","418","static/chunks/app/model_hub/page-8ed460f3f33c0bf2.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["Q5YcBgN0qLD3pcZcx1fRm",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["ffXp7j1jzMKpweBFKW_w2",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@ -2,6 +2,6 @@
3:I[667,["665","static/chunks/3014691f-b24e8254c7593934.js","902","static/chunks/902-58bf23027703b2e8.js","684","static/chunks/684-16b194c83a169f6d.js","777","static/chunks/777-a81b45dec53652df.js","461","static/chunks/app/onboarding/page-cba59362096ed469.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["Q5YcBgN0qLD3pcZcx1fRm",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["ffXp7j1jzMKpweBFKW_w2",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_86ef86","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00256a1984d35914.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,8 +1,50 @@
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
- model_name: gpt-4o
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["prometheus"]
callbacks: ["prometheus", "otel"]
general_settings:
user_api_key_cache_ttl: 3600
router_settings:
routing_strategy: latency-based-routing
routing_strategy_args:
# only assign 40% of traffic to the fastest deployment to avoid overloading it
lowest_latency_buffer: 0.4
# consider last five minutes of calls for latency calculation
ttl: 300
# model_group_alias:
# gpt-4o: gpt-4o-128k-2024-05-13
# gpt-4o-mini: gpt-4o-mini-128k-2024-07-18
enable_tag_filtering: True
# retry call 3 times on each model_name (we don't use fallbacks, so this would be 3 times total)
num_retries: 3
# -- cooldown settings --
# see https://github.com/BerriAI/litellm/blob/main/litellm/router_utils/cooldown_handlers.py#L265
# cooldown model if it fails > n calls in a minute.
allowed_fails: 2
# (in seconds) how long to cooldown model if fails/min > allowed_fails
cooldown_time: 60
allowed_fails_policy:
InternalServerErrorAllowedFails: 1
RateLimitErrorAllowedFails: 2
TimeoutErrorAllowedFails: 3
# -- end cooldown settings --
# see https://docs.litellm.ai/docs/proxy/prod#3-use-redis-porthost-password-not-redis_url
redis_host: os.environ/REDIS_HOST
redis_port: os.environ/REDIS_PORT
redis_password: os.environ/REDIS_PASSWORD
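
For reference, a minimal sketch (not part of this commit) of how the router_settings above would map onto litellm.Router when constructed directly in Python; the model list entry is a placeholder and the keyword names are assumed to mirror the YAML keys.

import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o",
            "litellm_params": {"model": "openai/fake", "api_key": "fake-key"},
        }
    ],
    routing_strategy="latency-based-routing",
    # only send 40% of traffic to the fastest deployment; use the last 5 minutes of calls
    routing_strategy_args={"lowest_latency_buffer": 0.4, "ttl": 300},
    num_retries=3,
    # cooldown a deployment after 2 failures/minute, for 60 seconds
    allowed_fails=2,
    cooldown_time=60,
    # shared Redis so cooldowns and latency stats are visible across proxy instances
    redis_host=os.environ.get("REDIS_HOST"),
    redis_port=int(os.environ.get("REDIS_PORT", "6379")),
    redis_password=os.environ.get("REDIS_PASSWORD"),
)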

View file

@ -104,7 +104,7 @@ class LitellmUserRoles(str, enum.Enum):
return ui_labels.get(self.value, "")
class LitellmTableNames(enum.Enum):
class LitellmTableNames(str, enum.Enum):
"""
Enum for Table Names used by LiteLLM
"""
@ -340,6 +340,7 @@ class LiteLLMRoutes(enum.Enum):
"/sso/get/ui_settings",
"/login",
"/key/generate",
"/key/{token_id}/regenerate",
"/key/update",
"/key/info",
"/key/delete",
@ -1371,6 +1372,8 @@ class LiteLLM_VerificationToken(LiteLLMBase):
blocked: Optional[bool] = None
litellm_budget_table: Optional[dict] = None
org_id: Optional[str] = None # org id for a given key
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
model_config = ConfigDict(protected_namespaces=())
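
The `str` mixin added to LitellmTableNames above is the standard way to make enum members behave as plain strings (so they compare and JSON-serialize as their values); a small illustration, independent of litellm:

import enum
import json

class TableNamesPlain(enum.Enum):
    KEY_TABLE = "LiteLLM_VerificationToken"

class TableNamesStr(str, enum.Enum):
    KEY_TABLE = "LiteLLM_VerificationToken"

json.dumps({"table": TableNamesStr.KEY_TABLE})      # '{"table": "LiteLLM_VerificationToken"}'
# json.dumps({"table": TableNamesPlain.KEY_TABLE})  # TypeError: not JSON serializable
TableNamesStr.KEY_TABLE == "LiteLLM_VerificationToken"  # True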

View file

@ -28,7 +28,7 @@ from litellm.proxy._types import (
LitellmUserRoles,
UserAPIKeyAuth,
)
from litellm.proxy.auth.route_checks import is_llm_api_route
from litellm.proxy.auth.route_checks import RouteChecks
from litellm.proxy.utils import PrismaClient, ProxyLogging, log_to_opentelemetry
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
@ -138,7 +138,7 @@ def common_checks( # noqa: PLR0915
general_settings.get("enforce_user_param", None) is not None
and general_settings["enforce_user_param"] is True
):
if is_llm_api_route(route=route) and "user" not in request_body:
if RouteChecks.is_llm_api_route(route=route) and "user" not in request_body:
raise Exception(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
)
@ -154,7 +154,7 @@ def common_checks( # noqa: PLR0915
+ CommonProxyErrors.not_premium_user.value
)
if is_llm_api_route(route=route):
if RouteChecks.is_llm_api_route(route=route):
# loop through each enforced param
# example enforced_params ['user', 'metadata', 'metadata.generation_name']
for enforced_param in general_settings["enforced_params"]:
@ -182,7 +182,7 @@ def common_checks( # noqa: PLR0915
and global_proxy_spend is not None
# only run global budget checks for OpenAI routes
# Reason - the Admin UI should continue working if the proxy crosses its global budget
and is_llm_api_route(route=route)
and RouteChecks.is_llm_api_route(route=route)
and route != "/v1/models"
and route != "/models"
):
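
As context for the enforce_user_param check in the first hunk of this file, a hypothetical client call against the proxy (base URL and key are placeholders); without the `user` field, common_checks rejects LLM API routes when `enforce_user_param` is set.

import openai

client = openai.OpenAI(base_url="http://localhost:4000", api_key="sk-1234")  # placeholder proxy + key
client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "hi"}],
    user="customer-123",  # required on LLM API routes when enforce_user_param is enabled
)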

View file

@ -17,175 +17,199 @@ from .auth_checks_organization import _user_is_org_admin
from .auth_utils import _has_user_setup_sso
def non_proxy_admin_allowed_routes_check(
user_obj: Optional[LiteLLM_UserTable],
_user_role: Optional[LitellmUserRoles],
route: str,
request: Request,
valid_token: UserAPIKeyAuth,
api_key: str,
request_data: dict,
):
"""
Checks if Non Proxy Admin User is allowed to access the route
"""
class RouteChecks:
# Check user has defined custom admin routes
custom_admin_only_route_check(
route=route,
)
if is_llm_api_route(route=route):
pass
elif (
route in LiteLLMRoutes.info_routes.value
): # check if user allowed to call an info route
if route == "/key/info":
# check if user can access this route
query_params = request.query_params
key = query_params.get("key")
if key is not None and hash_token(token=key) != api_key:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="user not allowed to access this key's info",
)
elif route == "/user/info":
# check if user can access this route
query_params = request.query_params
user_id = query_params.get("user_id")
verbose_proxy_logger.debug(
f"user_id: {user_id} & valid_token.user_id: {valid_token.user_id}"
)
if user_id and user_id != valid_token.user_id:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="key not allowed to access this user's info. user_id={}, key's user_id={}".format(
user_id, valid_token.user_id
),
)
elif route == "/model/info":
# /model/info just shows models user has access to
pass
elif route == "/team/info":
pass # handled by function itself
elif _has_user_setup_sso() and route in LiteLLMRoutes.sso_only_routes.value:
pass
elif (
route in LiteLLMRoutes.global_spend_tracking_routes.value
and getattr(valid_token, "permissions", None) is not None
and "get_spend_routes" in getattr(valid_token, "permissions", [])
@staticmethod
def non_proxy_admin_allowed_routes_check(
user_obj: Optional[LiteLLM_UserTable],
_user_role: Optional[LitellmUserRoles],
route: str,
request: Request,
valid_token: UserAPIKeyAuth,
api_key: str,
request_data: dict,
):
"""
Checks if Non Proxy Admin User is allowed to access the route
"""
pass
elif _user_role == LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY.value:
if is_llm_api_route(route=route):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this OpenAI routes, role= {_user_role}",
)
if route in LiteLLMRoutes.management_routes.value:
# the Admin Viewer is only allowed to call /user/update for their own user_id and can only update
if route == "/user/update":
# Check the Request params are valid for PROXY_ADMIN_VIEW_ONLY
if request_data is not None and isinstance(request_data, dict):
_params_updated = request_data.keys()
for param in _params_updated:
if param not in ["user_email", "password"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route, role= {_user_role}. Trying to access: {route} and updating invalid param: {param}. only user_email and password can be updated",
)
else:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route, role= {_user_role}. Trying to access: {route}",
)
elif (
_user_role == LitellmUserRoles.INTERNAL_USER.value
and route in LiteLLMRoutes.internal_user_routes.value
):
pass
elif (
_user_is_org_admin(request_data=request_data, user_object=user_obj)
and route in LiteLLMRoutes.org_admin_allowed_routes.value
):
pass
elif (
_user_role == LitellmUserRoles.INTERNAL_USER_VIEW_ONLY.value
and route in LiteLLMRoutes.internal_user_view_only_routes.value
):
pass
elif (
route in LiteLLMRoutes.self_managed_routes.value
): # routes that manage their own allowed/disallowed logic
pass
else:
user_role = "unknown"
user_id = "unknown"
if user_obj is not None:
user_role = user_obj.user_role or "unknown"
user_id = user_obj.user_id or "unknown"
raise Exception(
f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
# Check user has defined custom admin routes
RouteChecks.custom_admin_only_route_check(
route=route,
)
if RouteChecks.is_llm_api_route(route=route):
pass
elif (
route in LiteLLMRoutes.info_routes.value
): # check if user allowed to call an info route
if route == "/key/info":
# check if user can access this route
query_params = request.query_params
key = query_params.get("key")
if key is not None and hash_token(token=key) != api_key:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="user not allowed to access this key's info",
)
elif route == "/user/info":
# check if user can access this route
query_params = request.query_params
user_id = query_params.get("user_id")
verbose_proxy_logger.debug(
f"user_id: {user_id} & valid_token.user_id: {valid_token.user_id}"
)
if user_id and user_id != valid_token.user_id:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="key not allowed to access this user's info. user_id={}, key's user_id={}".format(
user_id, valid_token.user_id
),
)
elif route == "/model/info":
# /model/info just shows models user has access to
pass
elif route == "/team/info":
pass # handled by function itself
elif _has_user_setup_sso() and route in LiteLLMRoutes.sso_only_routes.value:
pass
elif (
route in LiteLLMRoutes.global_spend_tracking_routes.value
and getattr(valid_token, "permissions", None) is not None
and "get_spend_routes" in getattr(valid_token, "permissions", [])
):
def custom_admin_only_route_check(route: str):
from litellm.proxy.proxy_server import general_settings, premium_user
pass
elif _user_role == LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY.value:
if RouteChecks.is_llm_api_route(route=route):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this OpenAI routes, role= {_user_role}",
)
if route in LiteLLMRoutes.management_routes.value:
# the Admin Viewer is only allowed to call /user/update for their own user_id and can only update
if route == "/user/update":
if "admin_only_routes" in general_settings:
if premium_user is not True:
verbose_proxy_logger.error(
f"Trying to use 'admin_only_routes' this is an Enterprise only feature. {CommonProxyErrors.not_premium_user.value}"
# Check the Request params are valid for PROXY_ADMIN_VIEW_ONLY
if request_data is not None and isinstance(request_data, dict):
_params_updated = request_data.keys()
for param in _params_updated:
if param not in ["user_email", "password"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route, role= {_user_role}. Trying to access: {route} and updating invalid param: {param}. only user_email and password can be updated",
)
else:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route, role= {_user_role}. Trying to access: {route}",
)
elif (
_user_role == LitellmUserRoles.INTERNAL_USER.value
and route in LiteLLMRoutes.internal_user_routes.value
):
pass
elif (
_user_is_org_admin(request_data=request_data, user_object=user_obj)
and route in LiteLLMRoutes.org_admin_allowed_routes.value
):
pass
elif (
_user_role == LitellmUserRoles.INTERNAL_USER_VIEW_ONLY.value
and route in LiteLLMRoutes.internal_user_view_only_routes.value
):
pass
elif (
route in LiteLLMRoutes.self_managed_routes.value
): # routes that manage their own allowed/disallowed logic
pass
else:
user_role = "unknown"
user_id = "unknown"
if user_obj is not None:
user_role = user_obj.user_role or "unknown"
user_id = user_obj.user_id or "unknown"
raise Exception(
f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
)
return
if route in general_settings["admin_only_routes"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route. Route={route} is an admin only route",
)
pass
@staticmethod
def custom_admin_only_route_check(route: str):
from litellm.proxy.proxy_server import general_settings, premium_user
if "admin_only_routes" in general_settings:
if premium_user is not True:
verbose_proxy_logger.error(
f"Trying to use 'admin_only_routes' this is an Enterprise only feature. {CommonProxyErrors.not_premium_user.value}"
)
return
if route in general_settings["admin_only_routes"]:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this route. Route={route} is an admin only route",
)
pass
@staticmethod
def is_llm_api_route(route: str) -> bool:
"""
Helper to check if the provided route is an OpenAI route
def is_llm_api_route(route: str) -> bool:
"""
Helper to check if the provided route is an OpenAI route
Returns:
- True: if route is an OpenAI route
- False: if route is not an OpenAI route
"""
if route in LiteLLMRoutes.openai_routes.value:
return True
if route in LiteLLMRoutes.anthropic_routes.value:
return True
# fuzzy match routes like "/v1/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
# Check for routes with placeholders
for openai_route in LiteLLMRoutes.openai_routes.value:
# Replace placeholders with regex pattern
# placeholders are written as "/threads/{thread_id}"
if "{" in openai_route:
if RouteChecks._route_matches_pattern(
route=route, pattern=openai_route
):
return True
# Pass through Bedrock, VertexAI, and Cohere Routes
if "/bedrock/" in route:
return True
if "/vertex-ai/" in route:
return True
if "/gemini/" in route:
return True
if "/cohere/" in route:
return True
if "/langfuse/" in route:
return True
return False
@staticmethod
def _route_matches_pattern(route: str, pattern: str) -> bool:
"""
Check if route matches the pattern placed in proxy/_types.py
Example:
- pattern: "/threads/{thread_id}"
- route: "/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
- returns: True
Returns:
- True: if route is an OpenAI route
- False: if route is not an OpenAI route
"""
if route in LiteLLMRoutes.openai_routes.value:
return True
if route in LiteLLMRoutes.anthropic_routes.value:
return True
# fuzzy match routes like "/v1/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
# Check for routes with placeholders
for openai_route in LiteLLMRoutes.openai_routes.value:
# Replace placeholders with regex pattern
# placeholders are written as "/threads/{thread_id}"
if "{" in openai_route:
pattern = re.sub(r"\{[^}]+\}", r"[^/]+", openai_route)
# Anchor the pattern to match the entire string
pattern = f"^{pattern}$"
if re.match(pattern, route):
return True
# Pass through Bedrock, VertexAI, and Cohere Routes
if "/bedrock/" in route:
return True
if "/vertex-ai/" in route:
return True
if "/gemini/" in route:
return True
if "/cohere/" in route:
return True
if "/langfuse/" in route:
return True
return False
- pattern: "/key/{token_id}/regenerate"
- route: "/key/regenerate/82akk800000000jjsk"
- returns: False, pattern is "/key/{token_id}/regenerate"
"""
pattern = re.sub(r"\{[^}]+\}", r"[^/]+", pattern)
# Anchor the pattern to match the entire string
pattern = f"^{pattern}$"
if re.match(pattern, route):
return True
return False
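
A standalone illustration (no litellm imports) of the placeholder substitution that _route_matches_pattern performs, matching the docstring examples above:

import re

def route_matches_pattern(route: str, pattern: str) -> bool:
    # "/key/{token_id}/regenerate" -> "^/key/[^/]+/regenerate$"
    regex = "^" + re.sub(r"\{[^}]+\}", r"[^/]+", pattern) + "$"
    return re.match(regex, route) is not None

assert route_matches_pattern("/key/88704/regenerate", "/key/{token_id}/regenerate") is True
assert route_matches_pattern("/key/regenerate/82akk800000000jjsk", "/key/{token_id}/regenerate") is False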

View file

@ -69,7 +69,7 @@ from litellm.proxy.auth.auth_utils import (
)
from litellm.proxy.auth.oauth2_check import check_oauth2_token
from litellm.proxy.auth.oauth2_proxy_hook import handle_oauth2_proxy_request
from litellm.proxy.auth.route_checks import non_proxy_admin_allowed_routes_check
from litellm.proxy.auth.route_checks import RouteChecks
from litellm.proxy.auth.service_account_checks import service_account_checks
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
from litellm.proxy.utils import _to_ns
@ -122,6 +122,11 @@ def _is_ui_route_allowed(
):
# Do something if the current route starts with any of the allowed routes
return True
elif any(
RouteChecks._route_matches_pattern(route=route, pattern=allowed_route)
for allowed_route in allowed_routes
):
return True
else:
if user_obj is not None and _is_user_proxy_admin(user_obj=user_obj):
return True
@ -150,7 +155,7 @@ def _is_api_route_allowed(
raise Exception("Invalid proxy server token passed")
if not _is_user_proxy_admin(user_obj=user_obj): # if non-admin
non_proxy_admin_allowed_routes_check(
RouteChecks.non_proxy_admin_allowed_routes_check(
user_obj=user_obj,
_user_role=_user_role,
route=route,

View file

@ -120,7 +120,7 @@ async def health_services_endpoint( # noqa: PLR0915
}
if service == "langfuse":
from litellm.integrations.langfuse import LangFuseLogger
from litellm.integrations.langfuse.langfuse import LangFuseLogger
langfuse_logger = LangFuseLogger()
langfuse_logger.Langfuse.auth_check()
@ -372,6 +372,11 @@ async def _db_health_readiness_check():
return db_health_cache
@router.get(
"/settings",
tags=["health"],
dependencies=[Depends(user_api_key_auth)],
)
@router.get(
"/active/callbacks",
tags=["health"],
@ -379,8 +384,29 @@ async def _db_health_readiness_check():
)
async def active_callbacks():
"""
Returns a list of active callbacks on litellm.callbacks, litellm.input_callback, litellm.failure_callback, litellm.success_callback
Returns a list of litellm level settings
This is useful for debugging and ensuring the proxy server is configured correctly.
Response schema:
```
{
"alerting": _alerting,
"litellm.callbacks": litellm_callbacks,
"litellm.input_callback": litellm_input_callbacks,
"litellm.failure_callback": litellm_failure_callbacks,
"litellm.success_callback": litellm_success_callbacks,
"litellm._async_success_callback": litellm_async_success_callbacks,
"litellm._async_failure_callback": litellm_async_failure_callbacks,
"litellm._async_input_callback": litellm_async_input_callbacks,
"all_litellm_callbacks": all_litellm_callbacks,
"num_callbacks": len(all_litellm_callbacks),
"num_alerting": _num_alerting,
"litellm.request_timeout": litellm.request_timeout,
}
```
"""
from litellm.proxy.proxy_server import general_settings, proxy_logging_obj
_alerting = str(general_settings.get("alerting"))
@ -421,6 +447,7 @@ async def active_callbacks():
"all_litellm_callbacks": all_litellm_callbacks,
"num_callbacks": len(all_litellm_callbacks),
"num_alerting": _num_alerting,
"litellm.request_timeout": litellm.request_timeout,
}
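
A quick, hypothetical smoke test of the debug endpoints touched above (proxy URL and key are placeholders):

import requests

headers = {"Authorization": "Bearer sk-1234"}  # placeholder admin key
settings = requests.get("http://localhost:4000/settings", headers=headers, timeout=10).json()
print(settings.get("num_callbacks"), settings.get("litellm.request_timeout"))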

View file

@ -16,7 +16,10 @@ from litellm.proxy._types import (
UserAPIKeyAuth,
)
from litellm.proxy.auth.auth_utils import get_request_route
from litellm.types.utils import SupportedCacheControls
from litellm.types.utils import (
StandardLoggingUserAPIKeyMetadata,
SupportedCacheControls,
)
if TYPE_CHECKING:
from litellm.proxy.proxy_server import ProxyConfig as _ProxyConfig
@ -159,56 +162,111 @@ def clean_headers(
return clean_headers
def get_forwardable_headers(
headers: Union[Headers, dict],
):
"""
Get the headers that should be forwarded to the LLM Provider.
Looks for any `x-` headers and sends them to the LLM Provider.
"""
forwarded_headers = {}
for header, value in headers.items():
if header.lower().startswith("x-") and not header.lower().startswith(
"x-stainless"
): # causes openai sdk to fail
forwarded_headers[header] = value
return forwarded_headers
def get_openai_org_id_from_headers(
headers: dict, general_settings: Optional[Dict] = None
) -> Optional[str]:
"""
Get the OpenAI Org ID from the headers.
"""
if (
general_settings is not None
and general_settings.get("forward_openai_org_id") is not True
class LiteLLMProxyRequestSetup:
@staticmethod
def _get_forwardable_headers(
headers: Union[Headers, dict],
):
"""
Get the headers that should be forwarded to the LLM Provider.
Looks for any `x-` headers and sends them to the LLM Provider.
"""
forwarded_headers = {}
for header, value in headers.items():
if header.lower().startswith("x-") and not header.lower().startswith(
"x-stainless"
): # causes openai sdk to fail
forwarded_headers[header] = value
return forwarded_headers
@staticmethod
def get_openai_org_id_from_headers(
headers: dict, general_settings: Optional[Dict] = None
) -> Optional[str]:
"""
Get the OpenAI Org ID from the headers.
"""
if (
general_settings is not None
and general_settings.get("forward_openai_org_id") is not True
):
return None
for header, value in headers.items():
if header.lower() == "openai-organization":
return value
return None
for header, value in headers.items():
if header.lower() == "openai-organization":
return value
return None
@staticmethod
def add_headers_to_llm_call(
headers: dict, user_api_key_dict: UserAPIKeyAuth
) -> dict:
"""
Add headers to the LLM call
def add_litellm_data_for_backend_llm_call(
headers: dict, general_settings: Optional[Dict[str, Any]] = None
) -> LitellmDataForBackendLLMCall:
"""
- Adds forwardable headers
- Adds org id
"""
data = LitellmDataForBackendLLMCall()
_headers = get_forwardable_headers(headers)
if _headers != {}:
data["headers"] = _headers
_organization = get_openai_org_id_from_headers(headers, general_settings)
if _organization is not None:
data["organization"] = _organization
return data
- Checks request headers for forwardable headers
- Checks if user information should be added to the headers
"""
from litellm.litellm_core_utils.litellm_logging import (
get_standard_logging_metadata,
)
returned_headers = LiteLLMProxyRequestSetup._get_forwardable_headers(headers)
if litellm.add_user_information_to_llm_headers is True:
litellm_logging_metadata_headers = (
LiteLLMProxyRequestSetup.get_sanitized_user_information_from_key(
user_api_key_dict=user_api_key_dict
)
)
for k, v in litellm_logging_metadata_headers.items():
if v is not None:
returned_headers["x-litellm-{}".format(k)] = v
return returned_headers
@staticmethod
def add_litellm_data_for_backend_llm_call(
*,
headers: dict,
user_api_key_dict: UserAPIKeyAuth,
general_settings: Optional[Dict[str, Any]] = None,
) -> LitellmDataForBackendLLMCall:
"""
- Adds forwardable headers
- Adds org id
"""
data = LitellmDataForBackendLLMCall()
if (
general_settings
and general_settings.get("forward_client_headers_to_llm_api") is True
):
_headers = LiteLLMProxyRequestSetup.add_headers_to_llm_call(
headers, user_api_key_dict
)
if _headers != {}:
data["headers"] = _headers
_organization = LiteLLMProxyRequestSetup.get_openai_org_id_from_headers(
headers, general_settings
)
if _organization is not None:
data["organization"] = _organization
return data
@staticmethod
def get_sanitized_user_information_from_key(
user_api_key_dict: UserAPIKeyAuth,
) -> StandardLoggingUserAPIKeyMetadata:
user_api_key_logged_metadata = StandardLoggingUserAPIKeyMetadata(
user_api_key_hash=user_api_key_dict.api_key, # just the hashed token
user_api_key_alias=user_api_key_dict.key_alias,
user_api_key_team_id=user_api_key_dict.team_id,
user_api_key_user_id=user_api_key_dict.user_id,
user_api_key_org_id=user_api_key_dict.org_id,
user_api_key_team_alias=user_api_key_dict.team_alias,
)
return user_api_key_logged_metadata
async def add_litellm_data_to_request( # noqa: PLR0915
@ -246,7 +304,13 @@ async def add_litellm_data_to_request( # noqa: PLR0915
),
)
data.update(add_litellm_data_for_backend_llm_call(_headers, general_settings))
data.update(
LiteLLMProxyRequestSetup.add_litellm_data_for_backend_llm_call(
headers=_headers,
user_api_key_dict=user_api_key_dict,
general_settings=general_settings,
)
)
# Include original request and headers in the data
data["proxy_server_request"] = {
@ -294,13 +358,22 @@ async def add_litellm_data_to_request( # noqa: PLR0915
data["metadata"]
)
data[_metadata_variable_name]["user_api_key"] = user_api_key_dict.api_key
data[_metadata_variable_name]["user_api_key_alias"] = getattr(
user_api_key_dict, "key_alias", None
user_api_key_logged_metadata = (
LiteLLMProxyRequestSetup.get_sanitized_user_information_from_key(
user_api_key_dict=user_api_key_dict
)
)
data[_metadata_variable_name].update(user_api_key_logged_metadata)
data[_metadata_variable_name][
"user_api_key"
] = (
user_api_key_dict.api_key
) # this is just the hashed token. [TODO]: replace variable name in repo.
data[_metadata_variable_name]["user_api_end_user_max_budget"] = getattr(
user_api_key_dict, "end_user_max_budget", None
)
data[_metadata_variable_name]["litellm_api_version"] = version
if general_settings is not None:
@ -308,15 +381,6 @@ async def add_litellm_data_to_request( # noqa: PLR0915
general_settings.get("global_max_parallel_requests", None)
)
data[_metadata_variable_name]["user_api_key_user_id"] = user_api_key_dict.user_id
data[_metadata_variable_name]["user_api_key_org_id"] = user_api_key_dict.org_id
data[_metadata_variable_name]["user_api_key_team_id"] = getattr(
user_api_key_dict, "team_id", None
)
data[_metadata_variable_name]["user_api_key_team_alias"] = getattr(
user_api_key_dict, "team_alias", None
)
### KEY-LEVEL Controls
key_metadata = user_api_key_dict.metadata
if "cache" in key_metadata:

View file

@ -0,0 +1,43 @@
"""
Functions to create audit logs for LiteLLM Proxy
"""
import json
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import LiteLLM_AuditLogs
async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
from litellm.proxy.proxy_server import premium_user, prisma_client
if premium_user is not True:
return
if litellm.store_audit_logs is not True:
return
if prisma_client is None:
raise Exception("prisma_client is None, no DB connected")
verbose_proxy_logger.debug("creating audit log for %s", request_data)
if isinstance(request_data.updated_values, dict):
request_data.updated_values = json.dumps(request_data.updated_values)
if isinstance(request_data.before_value, dict):
request_data.before_value = json.dumps(request_data.before_value)
_request_data = request_data.model_dump(exclude_none=True)
try:
await prisma_client.db.litellm_auditlog.create(
data={
**_request_data, # type: ignore
}
)
except Exception as e:
# [Non-Blocking Exception. Do not allow blocking LLM API call]
verbose_proxy_logger.error(f"Failed Creating audit log {e}")
return
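
A rough caller sketch for the new helper (the audit_entry object is assumed to be an already-populated LiteLLM_AuditLogs instance; its fields are not shown in this diff):

import asyncio
from litellm.proxy._types import LiteLLM_AuditLogs
from litellm.proxy.management_helpers.audit_logs import create_audit_log_for_update

async def record_update(audit_entry: LiteLLM_AuditLogs) -> None:
    # fire-and-forget: failures are logged inside the helper and never block the API call
    asyncio.create_task(create_audit_log_for_update(request_data=audit_entry))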

View file

@ -125,7 +125,7 @@ def is_port_in_use(port):
)
@click.option(
"--request_timeout",
default=600,
default=6000,
type=int,
help="Set timeout in seconds for completion calls",
)

View file

@ -1,48 +1,20 @@
model_list:
################################################################################
# Azure
- model_name: gpt-4o-mini
litellm_params:
model: azure/gpt-4o-mini
api_base: https://amazin-prod.openai.azure.com
api_key: "os.environ/AZURE_GPT_4O"
deployment_id: gpt-4o-mini
- model_name: gpt-4o
litellm_params:
model: azure/gpt-4o
api_base: https://very-cool-prod.openai.azure.com
api_key: "os.environ/AZURE_GPT_4O"
deployment_id: gpt-4o
model: gpt-4o
api_key: os.environ/OPENAI_API_KEY
tpm: 1000000
rpm: 10000
################################################################################
# Fireworks
- model_name: fireworks-llama-v3p1-405b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct
api_key: "os.environ/FIREWORKS"
- model_name: fireworks-llama-v3p1-70b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3p1-70b-instruct
api_key: "os.environ/FIREWORKS"
general_settings:
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
success_callback: ["prometheus"]
service_callback: ["prometheus_system"]
drop_params: False # Raise an exception if the openai param being passed in isn't supported.
cache: false
default_internal_user_params:
user_role: os.environ/DEFAULT_USER_ROLE
# master key is set via env var
# master_key: #######
proxy_batch_write_at: 60 # Batch write spend updates every 60s
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
litellm_settings:
store_audit_logs: true
router_settings:
routing_strategy: simple-shuffle # "simple-shuffle" shown to result in highest throughput. https://docs.litellm.ai/docs/proxy/configs#load-balancing
# https://docs.litellm.ai/docs/proxy/reliability#default-fallbacks
default_fallbacks: ["gpt-4o-2024-08-06", "claude-3-5-sonnet-20240620"]
fallbacks: [{"gpt-4o-2024-08-06": ["claude-3-5-sonnet-20240620"]}, {"gpt-4o-2024-05-13": ["claude-3-5-sonnet-20240620"]}]
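
A rough Python equivalent (not part of this commit) of the fallback settings in the updated config above; the model list entries are placeholders.

from litellm import Router

router = Router(
    model_list=[
        {"model_name": "gpt-4o-2024-08-06", "litellm_params": {"model": "gpt-4o-2024-08-06"}},
        {"model_name": "claude-3-5-sonnet-20240620", "litellm_params": {"model": "claude-3-5-sonnet-20240620"}},
    ],
    routing_strategy="simple-shuffle",
    default_fallbacks=["claude-3-5-sonnet-20240620"],
    fallbacks=[{"gpt-4o-2024-08-06": ["claude-3-5-sonnet-20240620"]}],
)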

View file

@ -194,6 +194,7 @@ from litellm.proxy.management_endpoints.team_callback_endpoints import (
)
from litellm.proxy.management_endpoints.team_endpoints import router as team_router
from litellm.proxy.management_endpoints.ui_sso import router as ui_sso_router
from litellm.proxy.management_helpers.audit_logs import create_audit_log_for_update
from litellm.proxy.openai_files_endpoints.files_endpoints import is_known_model
from litellm.proxy.openai_files_endpoints.files_endpoints import (
router as openai_files_router,
@ -638,18 +639,6 @@ def _resolve_pydantic_type(typ) -> List:
return typs
def prisma_setup(database_url: Optional[str]):
global prisma_client, proxy_logging_obj, user_api_key_cache
if database_url is not None:
try:
prisma_client = PrismaClient(
database_url=database_url, proxy_logging_obj=proxy_logging_obj
)
except Exception as e:
raise e
def load_from_azure_key_vault(use_azure_key_vault: bool = False):
if use_azure_key_vault is False:
return
@ -1548,7 +1537,7 @@ class ProxyConfig:
## INIT PROXY REDIS USAGE CLIENT ##
redis_usage_cache = litellm.cache.cache
async def get_config(self, config_file_path: Optional[str] = None) -> dict:
"""
Load config file
@ -2801,137 +2790,55 @@ def giveup(e):
return result
@router.on_event("startup")
async def startup_event(): # noqa: PLR0915
global prisma_client, master_key, use_background_health_checks, llm_router, llm_model_list, general_settings, proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time, litellm_proxy_admin_name, db_writer_client, store_model_in_db, premium_user, _license_check
import json
class ProxyStartupEvent:
@classmethod
def _initialize_startup_logging(
cls,
llm_router: Optional[litellm.Router],
proxy_logging_obj: ProxyLogging,
redis_usage_cache: Optional[RedisCache],
):
"""Initialize logging and alerting on startup"""
## COST TRACKING ##
cost_tracking()
init_verbose_loggers()
## Error Tracking ##
error_tracking()
### LOAD MASTER KEY ###
# check if master key set in environment - load from there
master_key = get_secret("LITELLM_MASTER_KEY", None) # type: ignore
# check if DATABASE_URL in environment - load from there
if prisma_client is None:
_db_url: Optional[str] = get_secret("DATABASE_URL", None) # type: ignore
prisma_setup(database_url=_db_url)
proxy_logging_obj.startup_event(
llm_router=llm_router, redis_usage_cache=redis_usage_cache
)
### LOAD CONFIG ###
worker_config: Optional[Union[str, dict]] = get_secret("WORKER_CONFIG") # type: ignore
env_config_yaml: Optional[str] = get_secret_str("CONFIG_FILE_PATH")
verbose_proxy_logger.debug("worker_config: %s", worker_config)
# check if it's a valid file path
if env_config_yaml is not None:
if os.path.isfile(env_config_yaml) and proxy_config.is_yaml(
config_file_path=env_config_yaml
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=env_config_yaml
)
elif worker_config is not None:
if (
isinstance(worker_config, str)
and os.path.isfile(worker_config)
and proxy_config.is_yaml(config_file_path=worker_config)
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=worker_config
)
elif os.environ.get("LITELLM_CONFIG_BUCKET_NAME") is not None and isinstance(
worker_config, str
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=worker_config
)
elif isinstance(worker_config, dict):
await initialize(**worker_config)
@classmethod
def _initialize_jwt_auth(
cls,
general_settings: dict,
prisma_client: Optional[PrismaClient],
user_api_key_cache: DualCache,
):
"""Initialize JWT auth on startup"""
if general_settings.get("litellm_jwtauth", None) is not None:
for k, v in general_settings["litellm_jwtauth"].items():
if isinstance(v, str) and v.startswith("os.environ/"):
general_settings["litellm_jwtauth"][k] = get_secret(v)
litellm_jwtauth = LiteLLM_JWTAuth(**general_settings["litellm_jwtauth"])
else:
# if not, assume it's a json string
worker_config = json.loads(worker_config)
if isinstance(worker_config, dict):
await initialize(**worker_config)
## CHECK PREMIUM USER
verbose_proxy_logger.debug(
"litellm.proxy.proxy_server.py::startup() - CHECKING PREMIUM USER - {}".format(
premium_user
litellm_jwtauth = LiteLLM_JWTAuth()
jwt_handler.update_environment(
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
litellm_jwtauth=litellm_jwtauth,
)
)
if premium_user is False:
premium_user = _license_check.is_premium()
verbose_proxy_logger.debug(
"litellm.proxy.proxy_server.py::startup() - PREMIUM USER value - {}".format(
premium_user
)
)
## COST TRACKING ##
cost_tracking()
## Error Tracking ##
error_tracking()
## UPDATE SLACK ALERTING ##
proxy_logging_obj.slack_alerting_instance.update_values(llm_router=llm_router)
db_writer_client = HTTPHandler()
## UPDATE INTERNAL USAGE CACHE ##
proxy_logging_obj.update_values(
redis_cache=redis_usage_cache
) # used by parallel request limiter for rate limiting keys across instances
proxy_logging_obj._init_litellm_callbacks(
llm_router=llm_router
) # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
if "daily_reports" in proxy_logging_obj.slack_alerting_instance.alert_types:
asyncio.create_task(
proxy_logging_obj.slack_alerting_instance._run_scheduled_daily_report(
llm_router=llm_router
)
) # RUN DAILY REPORT (if scheduled)
## JWT AUTH ##
if general_settings.get("litellm_jwtauth", None) is not None:
for k, v in general_settings["litellm_jwtauth"].items():
if isinstance(v, str) and v.startswith("os.environ/"):
general_settings["litellm_jwtauth"][k] = get_secret(v)
litellm_jwtauth = LiteLLM_JWTAuth(**general_settings["litellm_jwtauth"])
else:
litellm_jwtauth = LiteLLM_JWTAuth()
jwt_handler.update_environment(
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
litellm_jwtauth=litellm_jwtauth,
)
if use_background_health_checks:
asyncio.create_task(
_run_background_health_check()
) # start the background health check coroutine.
if prompt_injection_detection_obj is not None:
prompt_injection_detection_obj.update_environment(router=llm_router)
verbose_proxy_logger.debug("prisma_client: %s", prisma_client)
if prisma_client is not None:
await prisma_client.connect()
if prisma_client is not None and master_key is not None:
@classmethod
def _add_master_key_hash_to_db(
cls,
master_key: str,
prisma_client: PrismaClient,
litellm_proxy_admin_name: str,
general_settings: dict,
):
"""Adds master key hash to db for cost tracking"""
if os.getenv("PROXY_ADMIN_ID", None) is not None:
litellm_proxy_admin_name = os.getenv(
"PROXY_ADMIN_ID", litellm_proxy_admin_name
@ -2956,7 +2863,9 @@ async def startup_event(): # noqa: PLR0915
)
asyncio.create_task(task_1)
if prisma_client is not None and litellm.max_budget > 0:
@classmethod
def _add_proxy_budget_to_db(cls, litellm_proxy_budget_name: str):
"""Adds a global proxy budget to db"""
if litellm.budget_duration is None:
raise Exception(
"budget_duration not set on Proxy. budget_duration is required to use max_budget."
@ -2982,8 +2891,18 @@ async def startup_event(): # noqa: PLR0915
)
)
### START BATCH WRITING DB + CHECKING NEW MODELS###
if prisma_client is not None:
@classmethod
async def initialize_scheduled_background_jobs(
cls,
general_settings: dict,
prisma_client: PrismaClient,
proxy_budget_rescheduler_min_time: int,
proxy_budget_rescheduler_max_time: int,
proxy_batch_write_at: int,
proxy_logging_obj: ProxyLogging,
store_model_in_db: bool,
):
"""Initializes scheduled background jobs"""
scheduler = AsyncIOScheduler()
interval = random.randint(
proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time
@ -3072,6 +2991,165 @@ async def startup_event(): # noqa: PLR0915
scheduler.start()
@classmethod
def _setup_prisma_client(
cls,
database_url: Optional[str],
proxy_logging_obj: ProxyLogging,
user_api_key_cache: DualCache,
) -> Optional[PrismaClient]:
"""
- Sets up prisma client
- Adds necessary views to proxy
"""
prisma_client: Optional[PrismaClient] = None
if database_url is not None:
try:
prisma_client = PrismaClient(
database_url=database_url, proxy_logging_obj=proxy_logging_obj
)
except Exception as e:
raise e
## Add necessary views to proxy ##
asyncio.create_task(
prisma_client.check_view_exists()
) # check if all necessary views exist. Don't block execution
return prisma_client
@router.on_event("startup")
async def startup_event():
global prisma_client, master_key, use_background_health_checks, llm_router, llm_model_list, general_settings, proxy_budget_rescheduler_min_time, proxy_budget_rescheduler_max_time, litellm_proxy_admin_name, db_writer_client, store_model_in_db, premium_user, _license_check
import json
init_verbose_loggers()
### LOAD MASTER KEY ###
# check if master key set in environment - load from there
master_key = get_secret("LITELLM_MASTER_KEY", None) # type: ignore
# check if DATABASE_URL in environment - load from there
if prisma_client is None:
_db_url: Optional[str] = get_secret("DATABASE_URL", None) # type: ignore
prisma_client = ProxyStartupEvent._setup_prisma_client(
database_url=_db_url,
proxy_logging_obj=proxy_logging_obj,
user_api_key_cache=user_api_key_cache,
)
### LOAD CONFIG ###
worker_config: Optional[Union[str, dict]] = get_secret("WORKER_CONFIG") # type: ignore
env_config_yaml: Optional[str] = get_secret_str("CONFIG_FILE_PATH")
verbose_proxy_logger.debug("worker_config: %s", worker_config)
# check if it's a valid file path
if env_config_yaml is not None:
if os.path.isfile(env_config_yaml) and proxy_config.is_yaml(
config_file_path=env_config_yaml
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=env_config_yaml
)
elif worker_config is not None:
if (
isinstance(worker_config, str)
and os.path.isfile(worker_config)
and proxy_config.is_yaml(config_file_path=worker_config)
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=worker_config
)
elif os.environ.get("LITELLM_CONFIG_BUCKET_NAME") is not None and isinstance(
worker_config, str
):
(
llm_router,
llm_model_list,
general_settings,
) = await proxy_config.load_config(
router=llm_router, config_file_path=worker_config
)
elif isinstance(worker_config, dict):
await initialize(**worker_config)
else:
# if not, assume it's a json string
worker_config = json.loads(worker_config)
if isinstance(worker_config, dict):
await initialize(**worker_config)
## CHECK PREMIUM USER
verbose_proxy_logger.debug(
"litellm.proxy.proxy_server.py::startup() - CHECKING PREMIUM USER - {}".format(
premium_user
)
)
if premium_user is False:
premium_user = _license_check.is_premium()
verbose_proxy_logger.debug(
"litellm.proxy.proxy_server.py::startup() - PREMIUM USER value - {}".format(
premium_user
)
)
ProxyStartupEvent._initialize_startup_logging(
llm_router=llm_router,
proxy_logging_obj=proxy_logging_obj,
redis_usage_cache=redis_usage_cache,
)
## JWT AUTH ##
ProxyStartupEvent._initialize_jwt_auth(
general_settings=general_settings,
prisma_client=prisma_client,
user_api_key_cache=user_api_key_cache,
)
if use_background_health_checks:
asyncio.create_task(
_run_background_health_check()
) # start the background health check coroutine.
if prompt_injection_detection_obj is not None: # [TODO] - REFACTOR THIS
prompt_injection_detection_obj.update_environment(router=llm_router)
verbose_proxy_logger.debug("prisma_client: %s", prisma_client)
if prisma_client is not None:
await prisma_client.connect()
if prisma_client is not None and master_key is not None:
ProxyStartupEvent._add_master_key_hash_to_db(
master_key=master_key,
prisma_client=prisma_client,
litellm_proxy_admin_name=litellm_proxy_admin_name,
general_settings=general_settings,
)
if prisma_client is not None and litellm.max_budget > 0:
ProxyStartupEvent._add_proxy_budget_to_db(
litellm_proxy_budget_name=litellm_proxy_admin_name
)
### START BATCH WRITING DB + CHECKING NEW MODELS###
if prisma_client is not None:
await ProxyStartupEvent.initialize_scheduled_background_jobs(
general_settings=general_settings,
prisma_client=prisma_client,
proxy_budget_rescheduler_min_time=proxy_budget_rescheduler_min_time,
proxy_budget_rescheduler_max_time=proxy_budget_rescheduler_max_time,
proxy_batch_write_at=proxy_batch_write_at,
proxy_logging_obj=proxy_logging_obj,
store_model_in_db=store_model_in_db,
)
#### API ENDPOINTS ####
@router.get(
@ -6327,11 +6405,7 @@ async def list_end_user(
--header 'Authorization: Bearer sk-1234'
```
"""
from litellm.proxy.proxy_server import (
create_audit_log_for_update,
litellm_proxy_admin_name,
prisma_client,
)
from litellm.proxy.proxy_server import litellm_proxy_admin_name, prisma_client
if (
user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN
@ -6362,38 +6436,6 @@ async def list_end_user(
return returned_response
async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
if premium_user is not True:
return
if litellm.store_audit_logs is not True:
return
if prisma_client is None:
raise Exception("prisma_client is None, no DB connected")
verbose_proxy_logger.debug("creating audit log for %s", request_data)
if isinstance(request_data.updated_values, dict):
request_data.updated_values = json.dumps(request_data.updated_values)
if isinstance(request_data.before_value, dict):
request_data.before_value = json.dumps(request_data.before_value)
_request_data = request_data.dict(exclude_none=True)
try:
await prisma_client.db.litellm_auditlog.create(
data={
**_request_data, # type: ignore
}
)
except Exception as e:
# [Non-Blocking Exception. Do not allow blocking LLM API call]
verbose_proxy_logger.error(f"Failed Creating audit log {e}")
return
#### BUDGET TABLE MANAGEMENT ####
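
For orientation, a comment-only outline (derived from the hunks above, not executable wiring) of the order in which the refactored startup_event now delegates to ProxyStartupEvent:

# 1. ProxyStartupEvent._setup_prisma_client(...)        - build PrismaClient, schedule view checks
# 2. proxy_config.load_config(...)                      - resolve CONFIG_FILE_PATH / WORKER_CONFIG
# 3. ProxyStartupEvent._initialize_startup_logging(...) - cost tracking, callbacks, slack alerting
# 4. ProxyStartupEvent._initialize_jwt_auth(...)        - litellm_jwtauth from general_settings
# 5. prisma_client.connect(), then _add_master_key_hash_to_db(...) / _add_proxy_budget_to_db(...)
# 6. ProxyStartupEvent.initialize_scheduled_background_jobs(...) - budget rescheduler, batch DB writes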

View file

@ -154,6 +154,8 @@ model LiteLLM_VerificationToken {
model_spend Json @default("{}")
model_max_budget Json @default("{}")
budget_id String?
created_at DateTime? @default(now()) @map("created_at")
updated_at DateTime? @default(now()) @updatedAt @map("updated_at")
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
}

View file

@ -349,6 +349,31 @@ class ProxyLogging:
)
self.premium_user = premium_user
def startup_event(
self,
llm_router: Optional[litellm.Router],
redis_usage_cache: Optional[RedisCache],
):
"""Initialize logging and alerting on proxy startup"""
## UPDATE SLACK ALERTING ##
self.slack_alerting_instance.update_values(llm_router=llm_router)
## UPDATE INTERNAL USAGE CACHE ##
self.update_values(
redis_cache=redis_usage_cache
) # used by parallel request limiter for rate limiting keys across instances
self._init_litellm_callbacks(
llm_router=llm_router
) # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
if "daily_reports" in self.slack_alerting_instance.alert_types:
asyncio.create_task(
self.slack_alerting_instance._run_scheduled_daily_report(
llm_router=llm_router
)
) # RUN DAILY REPORT (if scheduled)
def update_values(
self,
alerting: Optional[List] = None,

View file

@ -63,10 +63,7 @@ from litellm.router_utils.batch_utils import (
_get_router_metadata_variable_name,
replace_model_in_jsonl,
)
from litellm.router_utils.client_initalization_utils import (
set_client,
should_initialize_sync_client,
)
from litellm.router_utils.client_initalization_utils import InitalizeOpenAISDKClient
from litellm.router_utils.cooldown_cache import CooldownCache
from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
from litellm.router_utils.cooldown_handlers import (
@ -3951,7 +3948,7 @@ class Router:
raise Exception(f"Unsupported provider - {custom_llm_provider}")
# init OpenAI, Azure clients
set_client(
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment.to_json(exclude_none=True)
)
@ -4661,7 +4658,9 @@ class Router:
"""
Re-initialize the client
"""
set_client(litellm_router_instance=self, model=deployment)
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key, local_only=True)
return client
else:
@ -4671,7 +4670,9 @@ class Router:
"""
Re-initialize the client
"""
set_client(litellm_router_instance=self, model=deployment)
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key, local_only=True)
return client
else:
@ -4682,7 +4683,9 @@ class Router:
"""
Re-initialize the client
"""
set_client(litellm_router_instance=self, model=deployment)
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key)
return client
else:
@ -4692,7 +4695,9 @@ class Router:
"""
Re-initialize the client
"""
set_client(litellm_router_instance=self, model=deployment)
InitalizeOpenAISDKClient.set_client(
litellm_router_instance=self, model=deployment
)
client = self.cache.get_cache(key=cache_key)
return client

View file

@ -23,236 +23,227 @@ else:
LitellmRouter = Any
def should_initialize_sync_client(
litellm_router_instance: LitellmRouter,
) -> bool:
"""
Returns if Sync OpenAI, Azure Clients should be initialized.
class InitalizeOpenAISDKClient:
@staticmethod
def should_initialize_sync_client(
litellm_router_instance: LitellmRouter,
) -> bool:
"""
Returns if Sync OpenAI, Azure Clients should be initialized.
Do not init sync clients when router.router_general_settings.async_only_mode is True
Do not init sync clients when router.router_general_settings.async_only_mode is True
"""
if litellm_router_instance is None:
return False
if litellm_router_instance.router_general_settings is not None:
if (
hasattr(litellm_router_instance, "router_general_settings")
and hasattr(
litellm_router_instance.router_general_settings, "async_only_mode"
)
and litellm_router_instance.router_general_settings.async_only_mode is True
):
"""
if litellm_router_instance is None:
return False
return True
if litellm_router_instance.router_general_settings is not None:
if (
hasattr(litellm_router_instance, "router_general_settings")
and hasattr(
litellm_router_instance.router_general_settings, "async_only_mode"
)
and litellm_router_instance.router_general_settings.async_only_mode
is True
):
return False
return True
def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PLR0915
"""
- Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
- Initializes Semaphore for client w/ rpm. Stores them in cache. b/c of this - https://github.com/BerriAI/litellm/issues/2994
"""
client_ttl = litellm_router_instance.client_ttl
litellm_params = model.get("litellm_params", {})
model_name = litellm_params.get("model")
model_id = model["model_info"]["id"]
# ### IF RPM SET - initialize a semaphore ###
rpm = litellm_params.get("rpm", None)
tpm = litellm_params.get("tpm", None)
max_parallel_requests = litellm_params.get("max_parallel_requests", None)
calculated_max_parallel_requests = calculate_max_parallel_requests(
rpm=rpm,
max_parallel_requests=max_parallel_requests,
tpm=tpm,
default_max_parallel_requests=litellm_router_instance.default_max_parallel_requests,
)
if calculated_max_parallel_requests:
semaphore = asyncio.Semaphore(calculated_max_parallel_requests)
cache_key = f"{model_id}_max_parallel_requests_client"
litellm_router_instance.cache.set_cache(
key=cache_key,
value=semaphore,
local_only=True,
)
#### for OpenAI / Azure we need to initialize the Client for High Traffic ########
custom_llm_provider = litellm_params.get("custom_llm_provider")
custom_llm_provider = custom_llm_provider or model_name.split("/", 1)[0] or ""
default_api_base = None
default_api_key = None
if custom_llm_provider in litellm.openai_compatible_providers:
_, custom_llm_provider, api_key, api_base = litellm.get_llm_provider(
model=model_name
)
default_api_base = api_base
default_api_key = api_key
if (
model_name in litellm.open_ai_chat_completion_models
or custom_llm_provider in litellm.openai_compatible_providers
or custom_llm_provider == "azure"
or custom_llm_provider == "azure_text"
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "openai"
or custom_llm_provider == "text-completion-openai"
or "ft:gpt-3.5-turbo" in model_name
or model_name in litellm.open_ai_embedding_models
@staticmethod
def set_client( # noqa: PLR0915
litellm_router_instance: LitellmRouter, model: dict
):
is_azure_ai_studio_model: bool = False
if custom_llm_provider == "azure":
if litellm.utils._is_non_openai_azure_model(model_name):
is_azure_ai_studio_model = True
custom_llm_provider = "openai"
# remove azure prefix from model_name
model_name = model_name.replace("azure/", "")
# glorified / complicated reading of configs
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env
# we do this here because we init clients for Azure, OpenAI and we need to set the right key
api_key = litellm_params.get("api_key") or default_api_key
if api_key and isinstance(api_key, str) and api_key.startswith("os.environ/"):
api_key_env_name = api_key.replace("os.environ/", "")
api_key = get_secret_str(api_key_env_name)
litellm_params["api_key"] = api_key
api_base = litellm_params.get("api_base")
base_url: Optional[str] = litellm_params.get("base_url")
api_base = (
api_base or base_url or default_api_base
) # allow users to pass in `api_base` or `base_url` for azure
if api_base and api_base.startswith("os.environ/"):
api_base_env_name = api_base.replace("os.environ/", "")
api_base = get_secret_str(api_base_env_name)
litellm_params["api_base"] = api_base
## AZURE AI STUDIO MISTRAL CHECK ##
"""
Make sure api base ends in /v1/
if not, add it - https://github.com/BerriAI/litellm/issues/2279
- Initializes Azure/OpenAI clients. Stores them in cache, b/c of this - https://github.com/BerriAI/litellm/issues/1278
- Initializes Semaphore for client w/ rpm. Stores them in cache. b/c of this - https://github.com/BerriAI/litellm/issues/2994
"""
if (
is_azure_ai_studio_model is True
and api_base is not None
and isinstance(api_base, str)
and not api_base.endswith("/v1/")
):
# check if it ends with a trailing slash
if api_base.endswith("/"):
api_base += "v1/"
elif api_base.endswith("/v1"):
api_base += "/"
else:
api_base += "/v1/"
api_version = litellm_params.get("api_version")
if api_version and api_version.startswith("os.environ/"):
api_version_env_name = api_version.replace("os.environ/", "")
api_version = get_secret_str(api_version_env_name)
litellm_params["api_version"] = api_version
timeout: Optional[float] = (
litellm_params.pop("timeout", None) or litellm.request_timeout
client_ttl = litellm_router_instance.client_ttl
litellm_params = model.get("litellm_params", {})
model_name = litellm_params.get("model")
model_id = model["model_info"]["id"]
# ### IF RPM SET - initialize a semaphore ###
rpm = litellm_params.get("rpm", None)
tpm = litellm_params.get("tpm", None)
max_parallel_requests = litellm_params.get("max_parallel_requests", None)
calculated_max_parallel_requests = calculate_max_parallel_requests(
rpm=rpm,
max_parallel_requests=max_parallel_requests,
tpm=tpm,
default_max_parallel_requests=litellm_router_instance.default_max_parallel_requests,
)
if isinstance(timeout, str) and timeout.startswith("os.environ/"):
timeout_env_name = timeout.replace("os.environ/", "")
timeout = get_secret(timeout_env_name) # type: ignore
litellm_params["timeout"] = timeout
stream_timeout: Optional[float] = litellm_params.pop(
"stream_timeout", timeout
) # if no stream_timeout is set, default to timeout
if isinstance(stream_timeout, str) and stream_timeout.startswith("os.environ/"):
stream_timeout_env_name = stream_timeout.replace("os.environ/", "")
stream_timeout = get_secret(stream_timeout_env_name) # type: ignore
litellm_params["stream_timeout"] = stream_timeout
max_retries: Optional[int] = litellm_params.pop(
"max_retries", 0
) # router handles retry logic
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
max_retries_env_name = max_retries.replace("os.environ/", "")
max_retries = get_secret(max_retries_env_name) # type: ignore
litellm_params["max_retries"] = max_retries
organization = litellm_params.get("organization", None)
if isinstance(organization, str) and organization.startswith("os.environ/"):
organization_env_name = organization.replace("os.environ/", "")
organization = get_secret_str(organization_env_name)
litellm_params["organization"] = organization
azure_ad_token_provider: Optional[Callable[[], str]] = None
if litellm_params.get("tenant_id"):
verbose_router_logger.debug("Using Azure AD Token Provider for Azure Auth")
azure_ad_token_provider = get_azure_ad_token_from_entrata_id(
tenant_id=litellm_params.get("tenant_id"),
client_id=litellm_params.get("client_id"),
client_secret=litellm_params.get("client_secret"),
if calculated_max_parallel_requests:
semaphore = asyncio.Semaphore(calculated_max_parallel_requests)
cache_key = f"{model_id}_max_parallel_requests_client"
litellm_router_instance.cache.set_cache(
key=cache_key,
value=semaphore,
local_only=True,
)
if custom_llm_provider == "azure" or custom_llm_provider == "azure_text":
if api_base is None or not isinstance(api_base, str):
filtered_litellm_params = {
k: v for k, v in model["litellm_params"].items() if k != "api_key"
}
_filtered_model = {
"model_name": model["model_name"],
"litellm_params": filtered_litellm_params,
}
raise ValueError(
f"api_base is required for Azure OpenAI. Set it on your config. Model - {_filtered_model}"
)
azure_ad_token = litellm_params.get("azure_ad_token")
if azure_ad_token is not None:
if azure_ad_token.startswith("oidc/"):
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
elif (
azure_ad_token_provider is None
and litellm.enable_azure_ad_token_refresh is True
#### for OpenAI / Azure we need to initialize the Client for High Traffic ########
custom_llm_provider = litellm_params.get("custom_llm_provider")
custom_llm_provider = custom_llm_provider or model_name.split("/", 1)[0] or ""
default_api_base = None
default_api_key = None
if custom_llm_provider in litellm.openai_compatible_providers:
_, custom_llm_provider, api_key, api_base = litellm.get_llm_provider(
model=model_name
)
default_api_base = api_base
default_api_key = api_key
if (
model_name in litellm.open_ai_chat_completion_models
or custom_llm_provider in litellm.openai_compatible_providers
or custom_llm_provider == "azure"
or custom_llm_provider == "azure_text"
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "openai"
or custom_llm_provider == "text-completion-openai"
or "ft:gpt-3.5-turbo" in model_name
or model_name in litellm.open_ai_embedding_models
):
is_azure_ai_studio_model: bool = False
if custom_llm_provider == "azure":
if litellm.utils._is_non_openai_azure_model(model_name):
is_azure_ai_studio_model = True
custom_llm_provider = "openai"
# remove azure prefix from model_name
model_name = model_name.replace("azure/", "")
# glorified / complicated reading of configs
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env
# we do this here because we init clients for Azure, OpenAI and we need to set the right key
api_key = litellm_params.get("api_key") or default_api_key
if (
api_key
and isinstance(api_key, str)
and api_key.startswith("os.environ/")
):
try:
azure_ad_token_provider = get_azure_ad_token_provider()
except ValueError:
verbose_router_logger.debug(
"Azure AD Token Provider could not be used."
)
if api_version is None:
api_version = os.getenv(
"AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION
)
api_key_env_name = api_key.replace("os.environ/", "")
api_key = get_secret_str(api_key_env_name)
litellm_params["api_key"] = api_key
if "gateway.ai.cloudflare.com" in api_base:
if not api_base.endswith("/"):
api_base = litellm_params.get("api_base")
base_url: Optional[str] = litellm_params.get("base_url")
api_base = (
api_base or base_url or default_api_base
) # allow users to pass in `api_base` or `base_url` for azure
if api_base and api_base.startswith("os.environ/"):
api_base_env_name = api_base.replace("os.environ/", "")
api_base = get_secret_str(api_base_env_name)
litellm_params["api_base"] = api_base
## AZURE AI STUDIO MISTRAL CHECK ##
"""
Make sure api base ends in /v1/
if not, add it - https://github.com/BerriAI/litellm/issues/2279
"""
if (
is_azure_ai_studio_model is True
and api_base is not None
and isinstance(api_base, str)
and not api_base.endswith("/v1/")
):
# check if it ends with a trailing slash
if api_base.endswith("/"):
api_base += "v1/"
elif api_base.endswith("/v1"):
api_base += "/"
azure_model = model_name.replace("azure/", "")
api_base += f"{azure_model}"
cache_key = f"{model_id}_async_client"
_client = openai.AsyncAzureOpenAI(
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
base_url=api_base,
api_version=api_version,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
else:
api_base += "/v1/"
if should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
api_version = litellm_params.get("api_version")
if api_version and api_version.startswith("os.environ/"):
api_version_env_name = api_version.replace("os.environ/", "")
api_version = get_secret_str(api_version_env_name)
litellm_params["api_version"] = api_version
timeout: Optional[float] = (
litellm_params.pop("timeout", None) or litellm.request_timeout
)
if isinstance(timeout, str) and timeout.startswith("os.environ/"):
timeout_env_name = timeout.replace("os.environ/", "")
timeout = get_secret(timeout_env_name) # type: ignore
litellm_params["timeout"] = timeout
stream_timeout: Optional[float] = litellm_params.pop(
"stream_timeout", timeout
) # if no stream_timeout is set, default to timeout
if isinstance(stream_timeout, str) and stream_timeout.startswith(
"os.environ/"
):
stream_timeout_env_name = stream_timeout.replace("os.environ/", "")
stream_timeout = get_secret(stream_timeout_env_name) # type: ignore
litellm_params["stream_timeout"] = stream_timeout
max_retries: Optional[int] = litellm_params.pop(
"max_retries", 0
) # router handles retry logic
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
max_retries_env_name = max_retries.replace("os.environ/", "")
max_retries = get_secret(max_retries_env_name) # type: ignore
litellm_params["max_retries"] = max_retries
organization = litellm_params.get("organization", None)
if isinstance(organization, str) and organization.startswith("os.environ/"):
organization_env_name = organization.replace("os.environ/", "")
organization = get_secret_str(organization_env_name)
litellm_params["organization"] = organization
azure_ad_token_provider: Optional[Callable[[], str]] = None
if litellm_params.get("tenant_id"):
verbose_router_logger.debug(
"Using Azure AD Token Provider for Azure Auth"
)
azure_ad_token_provider = (
InitalizeOpenAISDKClient.get_azure_ad_token_from_entrata_id(
tenant_id=litellm_params.get("tenant_id"),
client_id=litellm_params.get("client_id"),
client_secret=litellm_params.get("client_secret"),
)
)
if custom_llm_provider == "azure" or custom_llm_provider == "azure_text":
if api_base is None or not isinstance(api_base, str):
filtered_litellm_params = {
k: v
for k, v in model["litellm_params"].items()
if k != "api_key"
}
_filtered_model = {
"model_name": model["model_name"],
"litellm_params": filtered_litellm_params,
}
raise ValueError(
f"api_base is required for Azure OpenAI. Set it on your config. Model - {_filtered_model}"
)
azure_ad_token = litellm_params.get("azure_ad_token")
if azure_ad_token is not None:
if azure_ad_token.startswith("oidc/"):
azure_ad_token = get_azure_ad_token_from_oidc(azure_ad_token)
elif (
azure_ad_token_provider is None
and litellm.enable_azure_ad_token_refresh is True
):
cache_key = f"{model_id}_client"
_client = openai.AzureOpenAI( # type: ignore
try:
azure_ad_token_provider = get_azure_ad_token_provider()
except ValueError:
verbose_router_logger.debug(
"Azure AD Token Provider could not be used."
)
if api_version is None:
api_version = os.getenv(
"AZURE_API_VERSION", litellm.AZURE_DEFAULT_API_VERSION
)
if "gateway.ai.cloudflare.com" in api_base:
if not api_base.endswith("/"):
api_base += "/"
azure_model = model_name.replace("azure/", "")
api_base += f"{azure_model}"
cache_key = f"{model_id}_async_client"
_client = openai.AsyncAzureOpenAI(
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
@ -260,7 +251,7 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
api_version=api_version,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
@ -273,35 +264,35 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
# streaming clients can have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
base_url=api_base,
api_version=api_version,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_stream_client"
_client = openai.AzureOpenAI( # type: ignore
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_client"
_client = openai.AzureOpenAI( # type: ignore
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
base_url=api_base,
api_version=api_version,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
# streaming clients can have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
@ -309,7 +300,7 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
api_version=api_version,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
@ -322,41 +313,159 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_stream_client"
_client = openai.AzureOpenAI( # type: ignore
api_key=api_key,
azure_ad_token=azure_ad_token,
azure_ad_token_provider=azure_ad_token_provider,
base_url=api_base,
api_version=api_version,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
else:
_api_key = api_key
if _api_key is not None and isinstance(_api_key, str):
# only show the first 8 chars of the api_key
_api_key = _api_key[:8] + "*" * 15
verbose_router_logger.debug(
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
)
azure_client_params = {
"api_key": api_key,
"azure_endpoint": api_base,
"api_version": api_version,
"azure_ad_token": azure_ad_token,
"azure_ad_token_provider": azure_ad_token_provider,
}
if azure_ad_token_provider is not None:
azure_client_params["azure_ad_token_provider"] = (
azure_ad_token_provider
)
from litellm.llms.AzureOpenAI.azure import (
select_azure_base_url_or_endpoint,
)
# this decides if we should set azure_endpoint or base_url on Azure OpenAI Client
# required to support GPT-4 vision enhancements, since base_url needs to be set on Azure OpenAI Client
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params
)
cache_key = f"{model_id}_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
**azure_client_params,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_client"
_client = openai.AzureOpenAI( # type: ignore
**azure_client_params,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
**azure_client_params,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_stream_client"
_client = openai.AzureOpenAI( # type: ignore
**azure_client_params,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
else:
_api_key = api_key
_api_key = api_key # type: ignore
if _api_key is not None and isinstance(_api_key, str):
# only show the first 8 chars of the api_key
_api_key = _api_key[:8] + "*" * 15
verbose_router_logger.debug(
f"Initializing Azure OpenAI Client for {model_name}, Api Base: {str(api_base)}, Api Key:{_api_key}"
f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{_api_key}"
)
azure_client_params = {
"api_key": api_key,
"azure_endpoint": api_base,
"api_version": api_version,
"azure_ad_token": azure_ad_token,
"azure_ad_token_provider": azure_ad_token_provider,
}
if azure_ad_token_provider is not None:
azure_client_params["azure_ad_token_provider"] = (
azure_ad_token_provider
)
from litellm.llms.AzureOpenAI.azure import (
select_azure_base_url_or_endpoint,
)
# this decides if we should set azure_endpoint or base_url on Azure OpenAI Client
# required to support GPT-4 vision enhancements, since base_url needs to be set on Azure OpenAI Client
azure_client_params = select_azure_base_url_or_endpoint(
azure_client_params
)
cache_key = f"{model_id}_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
**azure_client_params,
_client = openai.AsyncOpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
@ -370,14 +479,17 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if should_initialize_sync_client(
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_client"
_client = openai.AzureOpenAI( # type: ignore
**azure_client_params,
_client = openai.OpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
@ -394,16 +506,18 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncAzureOpenAI( # type: ignore
**azure_client_params,
_client = openai.AsyncOpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
@ -412,20 +526,23 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
local_only=True,
) # cache for 1 hr
if should_initialize_sync_client(
if InitalizeOpenAISDKClient.should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_client"
_client = openai.AzureOpenAI( # type: ignore
**azure_client_params,
_client = openai.OpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
),
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
@ -434,149 +551,49 @@ def set_client(litellm_router_instance: LitellmRouter, model: dict): # noqa: PL
local_only=True,
) # cache for 1 hr
@staticmethod
def get_azure_ad_token_from_entrata_id(
tenant_id: str, client_id: str, client_secret: str
) -> Callable[[], str]:
from azure.identity import (
ClientSecretCredential,
DefaultAzureCredential,
get_bearer_token_provider,
)
verbose_router_logger.debug("Getting Azure AD Token from Entrata ID")
if tenant_id.startswith("os.environ/"):
_tenant_id = get_secret_str(tenant_id)
else:
_api_key = api_key # type: ignore
if _api_key is not None and isinstance(_api_key, str):
# only show the first 8 chars of the api_key
_api_key = _api_key[:8] + "*" * 15
verbose_router_logger.debug(
f"Initializing OpenAI Client for {model_name}, Api Base:{str(api_base)}, Api Key:{_api_key}"
)
cache_key = f"{model_id}_async_client"
_client = openai.AsyncOpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
_tenant_id = tenant_id
if should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
cache_key = f"{model_id}_client"
_client = openai.OpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if client_id.startswith("os.environ/"):
_client_id = get_secret_str(client_id)
else:
_client_id = client_id
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_async_client"
_client = openai.AsyncOpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.AsyncClient(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
if client_secret.startswith("os.environ/"):
_client_secret = get_secret_str(client_secret)
else:
_client_secret = client_secret
if should_initialize_sync_client(
litellm_router_instance=litellm_router_instance
):
# streaming clients should have diff timeouts
cache_key = f"{model_id}_stream_client"
_client = openai.OpenAI( # type: ignore
api_key=api_key,
base_url=api_base,
timeout=stream_timeout, # type: ignore
max_retries=max_retries, # type: ignore
organization=organization,
http_client=httpx.Client(
limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), # type: ignore
)
litellm_router_instance.cache.set_cache(
key=cache_key,
value=_client,
ttl=client_ttl,
local_only=True,
) # cache for 1 hr
verbose_router_logger.debug(
"tenant_id %s, client_id %s, client_secret %s",
_tenant_id,
_client_id,
_client_secret,
)
if _tenant_id is None or _client_id is None or _client_secret is None:
raise ValueError("tenant_id, client_id, and client_secret must be provided")
credential = ClientSecretCredential(_tenant_id, _client_id, _client_secret)
verbose_router_logger.debug("credential %s", credential)
def get_azure_ad_token_from_entrata_id(
tenant_id: str, client_id: str, client_secret: str
) -> Callable[[], str]:
from azure.identity import (
ClientSecretCredential,
DefaultAzureCredential,
get_bearer_token_provider,
)
token_provider = get_bearer_token_provider(
credential, "https://cognitiveservices.azure.com/.default"
)
verbose_router_logger.debug("Getting Azure AD Token from Entrata ID")
verbose_router_logger.debug("token_provider %s", token_provider)
if tenant_id.startswith("os.environ/"):
_tenant_id = get_secret_str(tenant_id)
else:
_tenant_id = tenant_id
if client_id.startswith("os.environ/"):
_client_id = get_secret_str(client_id)
else:
_client_id = client_id
if client_secret.startswith("os.environ/"):
_client_secret = get_secret_str(client_secret)
else:
_client_secret = client_secret
verbose_router_logger.debug(
"tenant_id %s, client_id %s, client_secret %s",
_tenant_id,
_client_id,
_client_secret,
)
if _tenant_id is None or _client_id is None or _client_secret is None:
raise ValueError("tenant_id, client_id, and client_secret must be provided")
credential = ClientSecretCredential(_tenant_id, _client_id, _client_secret)
verbose_router_logger.debug("credential %s", credential)
token_provider = get_bearer_token_provider(
credential, "https://cognitiveservices.azure.com/.default"
)
verbose_router_logger.debug("token_provider %s", token_provider)
return token_provider
return token_provider
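# --- Hedged usage sketch (not part of the diff above): how these Entra ID credentials
# --- are typically supplied via `litellm_params`, using the `os.environ/` indirection
# --- that set_client resolves. Key names follow the reads shown above; the env var
# --- names and model alias are illustrative assumptions.
example_model_entry = {
    "model_name": "azure-gpt-4o",                      # illustrative alias
    "litellm_params": {
        "model": "azure/my-deployment",                # illustrative deployment name
        "api_base": "os.environ/AZURE_API_BASE",
        "api_version": "os.environ/AZURE_API_VERSION",
        # Entra ID client-credential auth instead of an api_key:
        "tenant_id": "os.environ/AZURE_TENANT_ID",
        "client_id": "os.environ/AZURE_CLIENT_ID",
        "client_secret": "os.environ/AZURE_CLIENT_SECRET",
    },
}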
View file
@ -4,6 +4,7 @@ Class to handle llm wildcard routing and regex pattern matching
import copy
import re
from re import Match
from typing import Dict, List, Optional
from litellm import get_llm_provider
@ -53,11 +54,12 @@ class PatternMatchRouter:
Returns:
str: regex pattern
"""
# Replace '*' with '.*' for regex matching
regex = pattern.replace("*", ".*")
# Escape other special characters
regex = re.escape(regex).replace(r"\.\*", ".*")
return f"^{regex}$"
# # Replace '*' with '.*' for regex matching
# regex = pattern.replace("*", ".*")
# # Escape other special characters
# regex = re.escape(regex).replace(r"\.\*", ".*")
# return f"^{regex}$"
return re.escape(pattern).replace(r"\*", "(.*)")
def route(self, request: Optional[str]) -> Optional[List[Dict]]:
"""
@ -84,6 +86,44 @@ class PatternMatchRouter:
return None # No matching pattern found
@staticmethod
def set_deployment_model_name(
matched_pattern: Match,
litellm_deployment_litellm_model: str,
) -> str:
"""
Set the model name for the matched pattern llm deployment
E.g.:
model_name: llmengine/* (can be any regex pattern or wildcard pattern)
litellm_params:
model: openai/*
if model_name = "llmengine/foo" -> model = "openai/foo"
"""
## BASE CASE: if the deployment model name does not contain a wildcard, return the deployment model name
if "*" not in litellm_deployment_litellm_model:
return litellm_deployment_litellm_model
wildcard_count = litellm_deployment_litellm_model.count("*")
# Extract all dynamic segments from the request
dynamic_segments = matched_pattern.groups()
if len(dynamic_segments) > wildcard_count:
raise ValueError(
f"More wildcards in the deployment model name than the pattern. Wildcard count: {wildcard_count}, dynamic segments count: {len(dynamic_segments)}"
)
# Replace the corresponding wildcards in the litellm model pattern with extracted segments
for segment in dynamic_segments:
litellm_deployment_litellm_model = litellm_deployment_litellm_model.replace(
"*", segment, 1
)
return litellm_deployment_litellm_model
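# --- Hedged standalone sketch of the wildcard flow above (plain `re`, not the router's
# --- public API): the pattern "llmengine/*" compiles to "llmengine/(.*)", the request
# --- "llmengine/foo" yields one dynamic segment, and that segment is substituted into
# --- the deployment's "openai/*" model name. Names are illustrative.
import re

regex = re.escape("llmengine/*").replace(r"\*", "(.*)")   # -> "llmengine/(.*)"
matched = re.match(f"^{regex}$", "llmengine/foo")
assert matched is not None and matched.groups() == ("foo",)

litellm_model = "openai/*"
for segment in matched.groups():
    litellm_model = litellm_model.replace("*", segment, 1)
assert litellm_model == "openai/foo"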
def get_pattern(
self, model: str, custom_llm_provider: Optional[str] = None
) -> Optional[List[Dict]]:
View file
@ -0,0 +1,10 @@
from typing import Optional
from pydantic import BaseModel
class ArizeConfig(BaseModel):
space_key: str
api_key: str
grpc_endpoint: Optional[str] = None
http_endpoint: Optional[str] = None
View file
@ -0,0 +1,52 @@
"""
Payloads for Datadog LLM Observability Service (LLMObs)
API Reference: https://docs.datadoghq.com/llm_observability/setup/api/?tab=example#api-standards
"""
from typing import Any, List, Literal, Optional, TypedDict
class InputMeta(TypedDict):
messages: List[Any]
class OutputMeta(TypedDict):
messages: List[Any]
class Meta(TypedDict):
# The span kind: "agent", "workflow", "llm", "tool", "task", "embedding", or "retrieval".
kind: Literal["llm", "tool", "task", "embedding", "retrieval"]
input: InputMeta # The span's input information.
output: OutputMeta # The span's output information.
class LLMMetrics(TypedDict, total=False):
input_tokens: float
output_tokens: float
total_tokens: float
time_to_first_token: float
time_per_output_token: float
class LLMObsPayload(TypedDict):
parent_id: str
trace_id: str
span_id: str
name: str
meta: Meta
start_ns: int
duration: int
metrics: LLMMetrics
class DDSpanAttributes(TypedDict):
ml_app: str
tags: List[str]
spans: List[LLMObsPayload]
class DDIntakePayload(TypedDict):
type: str
attributes: DDSpanAttributes
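# --- Hedged illustration of how the TypedDicts above compose into a single-span intake
# --- payload. Field names come from the definitions above; every value (app name, ids,
# --- timestamps, and the "span" type string) is an invented placeholder.
example_intake: DDIntakePayload = {
    "type": "span",                                    # assumed intake type
    "attributes": {
        "ml_app": "my-llm-app",
        "tags": ["env:dev"],
        "spans": [
            {
                "parent_id": "undefined",
                "trace_id": "trace-123",
                "span_id": "span-456",
                "name": "litellm_request",
                "meta": {
                    "kind": "llm",
                    "input": {"messages": [{"role": "user", "content": "hi"}]},
                    "output": {"messages": [{"role": "assistant", "content": "hello"}]},
                },
                "start_ns": 1_700_000_000_000_000_000,
                "duration": 250_000_000,               # nanoseconds
                "metrics": {"input_tokens": 1.0, "output_tokens": 2.0, "total_tokens": 3.0},
            }
        ],
    },
}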
View file
@ -0,0 +1,7 @@
from typing import Optional, TypedDict
class LangfuseLoggingConfig(TypedDict):
langfuse_secret: Optional[str]
langfuse_public_key: Optional[str]
langfuse_host: Optional[str]
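# --- Hedged example of a populated config; all values are placeholder strings.
example_config: LangfuseLoggingConfig = {
    "langfuse_secret": "sk-lf-PLACEHOLDER",
    "langfuse_public_key": "pk-lf-PLACEHOLDER",
    "langfuse_host": "https://cloud.langfuse.com",
}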
View file
@ -210,15 +210,23 @@ class ServerSentEvent:
return f"ServerSentEvent(event={self.event}, data={self.data}, id={self.id}, retry={self.retry})"
COHERE_EMBEDDING_INPUT_TYPES = Literal[
"search_document", "search_query", "classification", "clustering", "image"
]
class CohereEmbeddingRequest(TypedDict, total=False):
texts: Required[List[str]]
input_type: Required[
Literal["search_document", "search_query", "classification", "clustering"]
]
texts: List[str]
images: List[str]
input_type: Required[COHERE_EMBEDDING_INPUT_TYPES]
truncate: Literal["NONE", "START", "END"]
embedding_types: Literal["float", "int8", "uint8", "binary", "ubinary"]
class CohereEmbeddingRequestWithModel(CohereEmbeddingRequest):
model: Required[str]
class CohereEmbeddingResponse(TypedDict):
embeddings: List[List[float]]
id: str
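# --- Hedged example of the widened request type: with the shared input-type Literal,
# --- an image embedding request can be expressed next to a text one. The model name
# --- and data URI are illustrative placeholders.
text_request: CohereEmbeddingRequestWithModel = {
    "model": "embed-english-v3.0",
    "texts": ["hello world"],
    "input_type": "search_document",
}
image_request: CohereEmbeddingRequestWithModel = {
    "model": "embed-english-v3.0",
    "images": ["data:image/png;base64,<BASE64_BYTES>"],
    "input_type": "image",
}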
View file
@ -970,9 +970,9 @@ class EmbeddingResponse(OpenAIObject):
class Logprobs(OpenAIObject):
text_offset: List[int]
token_logprobs: List[float]
token_logprobs: List[Union[float, None]]
tokens: List[str]
top_logprobs: List[Dict[str, float]]
top_logprobs: List[Union[Dict[str, float], None]]
class TextChoices(OpenAIObject):
@ -1177,12 +1177,15 @@ from openai.types.images_response import ImagesResponse as OpenAIImageResponse
class ImageResponse(OpenAIImageResponse):
_hidden_params: dict = {}
usage: Usage
def __init__(
self,
created: Optional[int] = None,
data: Optional[List[ImageObject]] = None,
response_ms=None,
usage: Optional[Usage] = None,
hidden_params: Optional[dict] = None,
):
if response_ms:
_response_ms = response_ms
@ -1204,8 +1207,13 @@ class ImageResponse(OpenAIImageResponse):
_data.append(ImageObject(**d))
elif isinstance(d, BaseModel):
_data.append(ImageObject(**d.model_dump()))
super().__init__(created=created, data=_data)
self.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
_usage = usage or Usage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
super().__init__(created=created, data=_data, usage=_usage) # type: ignore
self._hidden_params = hidden_params or {}
def __contains__(self, key):
# Define custom behavior for the 'in' operator
@ -1404,16 +1412,20 @@ class AdapterCompletionStreamWrapper:
raise StopAsyncIteration
class StandardLoggingMetadata(TypedDict):
class StandardLoggingUserAPIKeyMetadata(TypedDict):
user_api_key_hash: Optional[str] # hash of the litellm virtual key used
user_api_key_alias: Optional[str]
user_api_key_org_id: Optional[str]
user_api_key_team_id: Optional[str]
user_api_key_user_id: Optional[str]
user_api_key_team_alias: Optional[str]
class StandardLoggingMetadata(StandardLoggingUserAPIKeyMetadata):
"""
Specific metadata k,v pairs logged to integration for easier cost tracking
"""
user_api_key_hash: Optional[str] # hash of the litellm virtual key used
user_api_key_alias: Optional[str]
user_api_key_team_id: Optional[str]
user_api_key_user_id: Optional[str]
user_api_key_team_alias: Optional[str]
spend_logs_metadata: Optional[
dict
] # special param to log k,v pairs to spendlogs for a call
View file
@ -70,6 +70,12 @@ from litellm.litellm_core_utils.get_llm_provider_logic import (
get_llm_provider,
)
from litellm.litellm_core_utils.llm_request_utils import _ensure_extra_body_is_safe
from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
_handle_invalid_parallel_tool_calls,
convert_to_model_response_object,
convert_to_streaming_response,
convert_to_streaming_response_async,
)
from litellm.litellm_core_utils.llm_response_utils.get_headers import (
get_response_headers,
)
@ -126,6 +132,7 @@ except (ImportError, AttributeError):
os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv(
"CUSTOM_TIKTOKEN_CACHE_DIR", filename
) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071
from tiktoken import Encoding
encoding = tiktoken.get_encoding("cl100k_base")
from importlib import resources
@ -213,13 +220,10 @@ prometheusLogger = None
dynamoLogger = None
s3Logger = None
genericAPILogger = None
clickHouseLogger = None
greenscaleLogger = None
lunaryLogger = None
aispendLogger = None
berrispendLogger = None
supabaseClient = None
liteDebuggerClient = None
callback_list: Optional[List[str]] = []
user_logger_fn = None
additional_details: Optional[Dict[str, str]] = {}
@ -609,7 +613,6 @@ def function_setup( # noqa: PLR0915
def client(original_function): # noqa: PLR0915
global liteDebuggerClient
rules_obj = Rules()
def check_coroutine(value) -> bool:
@ -1282,7 +1285,10 @@ def encode(model="", text="", custom_tokenizer: Optional[dict] = None):
enc: The encoded text.
"""
tokenizer_json = custom_tokenizer or _select_tokenizer(model=model)
enc = tokenizer_json["tokenizer"].encode(text)
if isinstance(tokenizer_json["tokenizer"], Encoding):
enc = tokenizer_json["tokenizer"].encode(text, disallowed_special=())
else:
enc = tokenizer_json["tokenizer"].encode(text)
return enc
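# --- Hedged sketch of the behavior the isinstance branch above guards against:
# --- tiktoken's Encoding.encode defaults to disallowed_special="all" and raises
# --- ValueError when the text contains a special token such as "<|endoftext|>";
# --- passing disallowed_special=() encodes that text as ordinary characters.
import tiktoken

_enc = tiktoken.get_encoding("cl100k_base")
_text = "prompt containing <|endoftext|> verbatim"
try:
    _enc.encode(_text)                       # raises ValueError by default
except ValueError:
    pass
_tokens = _enc.encode(_text, disallowed_special=())
assert len(_tokens) > 0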
@ -3049,8 +3055,8 @@ def get_optional_params( # noqa: PLR0915
)
if litellm.vertex_ai_safety_settings is not None:
optional_params["safety_settings"] = litellm.vertex_ai_safety_settings
elif (
custom_llm_provider == "vertex_ai" and model in litellm.vertex_anthropic_models
elif litellm.VertexAIAnthropicConfig.is_supported_model(
model=model, custom_llm_provider=custom_llm_provider
):
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
@ -5191,7 +5197,9 @@ def create_proxy_transport_and_mounts():
def validate_environment( # noqa: PLR0915
model: Optional[str] = None, api_key: Optional[str] = None
model: Optional[str] = None,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
) -> dict:
"""
Checks if the environment variables are valid for the given model.
@ -5218,11 +5226,6 @@ def validate_environment( # noqa: PLR0915
_, custom_llm_provider, _, _ = get_llm_provider(model=model)
except Exception:
custom_llm_provider = None
# # check if llm provider part of model name
# if model.split("/",1)[0] in litellm.provider_list:
# custom_llm_provider = model.split("/", 1)[0]
# model = model.split("/", 1)[1]
# custom_llm_provider_passed_in = True
if custom_llm_provider:
if custom_llm_provider == "openai":
@ -5491,476 +5494,20 @@ def validate_environment( # noqa: PLR0915
if "api_key" not in key.lower():
new_missing_keys.append(key)
missing_keys = new_missing_keys
if api_base is not None:
new_missing_keys = []
for key in missing_keys:
if "api_base" not in key.lower():
new_missing_keys.append(key)
missing_keys = new_missing_keys
if len(missing_keys) == 0: # no missing keys
keys_in_environment = True
return {"keys_in_environment": keys_in_environment, "missing_keys": missing_keys}
async def convert_to_streaming_response_async(response_object: Optional[dict] = None):
"""
Asynchronously converts a response object to a streaming response.
Args:
response_object (Optional[dict]): The response object to be converted. Defaults to None.
Raises:
Exception: If the response object is None.
Yields:
ModelResponse: The converted streaming response object.
Returns:
None
"""
if response_object is None:
raise Exception("Error in response object format")
model_response_object = ModelResponse(stream=True)
if model_response_object is None:
raise Exception("Error in response creating model response object")
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
if (
choice["message"].get("tool_calls", None) is not None
and isinstance(choice["message"]["tool_calls"], list)
and len(choice["message"]["tool_calls"]) > 0
and isinstance(choice["message"]["tool_calls"][0], dict)
):
pydantic_tool_calls = []
for index, t in enumerate(choice["message"]["tool_calls"]):
if "index" not in t:
t["index"] = index
pydantic_tool_calls.append(ChatCompletionDeltaToolCall(**t))
choice["message"]["tool_calls"] = pydantic_tool_calls
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
finish_reason = choice.get("finish_details")
logprobs = choice.get("logprobs", None)
choice = StreamingChoices(
finish_reason=finish_reason, index=idx, delta=delta, logprobs=logprobs
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
setattr(
model_response_object,
"usage",
Usage(
completion_tokens=response_object["usage"].get("completion_tokens", 0),
prompt_tokens=response_object["usage"].get("prompt_tokens", 0),
total_tokens=response_object["usage"].get("total_tokens", 0),
),
)
if "id" in response_object:
model_response_object.id = response_object["id"]
if "created" in response_object:
model_response_object.created = response_object["created"]
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object["system_fingerprint"]
if "model" in response_object:
model_response_object.model = response_object["model"]
yield model_response_object
await asyncio.sleep(0)
def convert_to_streaming_response(response_object: Optional[dict] = None):
# used for yielding Cache hits when stream == True
if response_object is None:
raise Exception("Error in response object format")
model_response_object = ModelResponse(stream=True)
choice_list = []
for idx, choice in enumerate(response_object["choices"]):
delta = Delta(
content=choice["message"].get("content", None),
role=choice["message"]["role"],
function_call=choice["message"].get("function_call", None),
tool_calls=choice["message"].get("tool_calls", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
finish_reason = choice.get("finish_details")
logprobs = choice.get("logprobs", None)
enhancements = choice.get("enhancements", None)
choice = StreamingChoices(
finish_reason=finish_reason,
index=idx,
delta=delta,
logprobs=logprobs,
enhancements=enhancements,
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
setattr(model_response_object, "usage", Usage())
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if "id" in response_object:
model_response_object.id = response_object["id"]
if "created" in response_object:
model_response_object.created = response_object["created"]
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object["system_fingerprint"]
if "model" in response_object:
model_response_object.model = response_object["model"]
yield model_response_object
from collections import defaultdict
def _handle_invalid_parallel_tool_calls(
tool_calls: List[ChatCompletionMessageToolCall],
):
"""
Handle hallucinated parallel tool call from openai - https://community.openai.com/t/model-tries-to-call-unknown-function-multi-tool-use-parallel/490653
Code modified from: https://github.com/phdowling/openai_multi_tool_use_parallel_patch/blob/main/openai_multi_tool_use_parallel_patch.py
"""
if tool_calls is None:
return
try:
replacements: Dict[int, List[ChatCompletionMessageToolCall]] = defaultdict(list)
for i, tool_call in enumerate(tool_calls):
current_function = tool_call.function.name
function_args = json.loads(tool_call.function.arguments)
if current_function == "multi_tool_use.parallel":
verbose_logger.debug(
"OpenAI did a weird pseudo-multi-tool-use call, fixing call structure.."
)
for _fake_i, _fake_tool_use in enumerate(function_args["tool_uses"]):
_function_args = _fake_tool_use["parameters"]
_current_function = _fake_tool_use["recipient_name"]
if _current_function.startswith("functions."):
_current_function = _current_function[len("functions.") :]
fixed_tc = ChatCompletionMessageToolCall(
id=f"{tool_call.id}_{_fake_i}",
type="function",
function=Function(
name=_current_function, arguments=json.dumps(_function_args)
),
)
replacements[i].append(fixed_tc)
shift = 0
for i, replacement in replacements.items():
tool_calls[:] = (
tool_calls[: i + shift] + replacement + tool_calls[i + shift + 1 :]
)
shift += len(replacement)
return tool_calls
except json.JSONDecodeError:
# if there is a JSONDecodeError, return the original tool_calls
return tool_calls
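# --- Hedged sketch of the repair performed above: one hallucinated
# --- "multi_tool_use.parallel" call is expanded into a tool call per entry in
# --- "tool_uses". The function names and arguments are invented for illustration;
# --- the import path follows the new llm_response_utils module referenced earlier.
import json

from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
    _handle_invalid_parallel_tool_calls,
)
from litellm.types.utils import ChatCompletionMessageToolCall, Function

bad_call = ChatCompletionMessageToolCall(
    id="call_abc",
    type="function",
    function=Function(
        name="multi_tool_use.parallel",
        arguments=json.dumps(
            {
                "tool_uses": [
                    {"recipient_name": "functions.get_weather", "parameters": {"city": "Paris"}},
                    {"recipient_name": "functions.get_time", "parameters": {"tz": "CET"}},
                ]
            }
        ),
    ),
)
fixed = _handle_invalid_parallel_tool_calls([bad_call])
assert [tc.function.name for tc in fixed] == ["get_weather", "get_time"]
assert [tc.id for tc in fixed] == ["call_abc_0", "call_abc_1"]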
def convert_to_model_response_object( # noqa: PLR0915
response_object: Optional[dict] = None,
model_response_object: Optional[
Union[
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
RerankResponse,
]
] = None,
response_type: Literal[
"completion", "embedding", "image_generation", "audio_transcription", "rerank"
] = "completion",
stream=False,
start_time=None,
end_time=None,
hidden_params: Optional[dict] = None,
_response_headers: Optional[dict] = None,
convert_tool_call_to_json_mode: Optional[
bool
] = None, # used for supporting 'json_schema' on older models
):
received_args = locals()
additional_headers = get_response_headers(_response_headers)
if hidden_params is None:
hidden_params = {}
hidden_params["additional_headers"] = additional_headers
### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
if (
response_object is not None
and "error" in response_object
and response_object["error"] is not None
):
error_args = {"status_code": 422, "message": "Error in response object"}
if isinstance(response_object["error"], dict):
if "code" in response_object["error"]:
error_args["status_code"] = response_object["error"]["code"]
if "message" in response_object["error"]:
if isinstance(response_object["error"]["message"], dict):
message_str = json.dumps(response_object["error"]["message"])
else:
message_str = str(response_object["error"]["message"])
error_args["message"] = message_str
raised_exception = Exception()
setattr(raised_exception, "status_code", error_args["status_code"])
setattr(raised_exception, "message", error_args["message"])
raise raised_exception
try:
if response_type == "completion" and (
model_response_object is None
or isinstance(model_response_object, ModelResponse)
):
if response_object is None or model_response_object is None:
raise Exception("Error in response object format")
if stream is True:
# for returning cached responses, we need to yield a generator
return convert_to_streaming_response(response_object=response_object)
choice_list = []
assert response_object["choices"] is not None and isinstance(
response_object["choices"], Iterable
)
for idx, choice in enumerate(response_object["choices"]):
## HANDLE JSON MODE - anthropic returns single function call
tool_calls = choice["message"].get("tool_calls", None)
if tool_calls is not None:
_openai_tool_calls = []
for _tc in tool_calls:
_openai_tc = ChatCompletionMessageToolCall(**_tc)
_openai_tool_calls.append(_openai_tc)
fixed_tool_calls = _handle_invalid_parallel_tool_calls(
_openai_tool_calls
)
if fixed_tool_calls is not None:
tool_calls = fixed_tool_calls
message: Optional[Message] = None
finish_reason: Optional[str] = None
if (
convert_tool_call_to_json_mode
and tool_calls is not None
and len(tool_calls) == 1
):
# to support 'json_schema' logic on older models
json_mode_content_str: Optional[str] = tool_calls[0][
"function"
].get("arguments")
if json_mode_content_str is not None:
message = litellm.Message(content=json_mode_content_str)
finish_reason = "stop"
if message is None:
message = Message(
content=choice["message"].get("content", None),
role=choice["message"]["role"] or "assistant",
function_call=choice["message"].get("function_call", None),
tool_calls=tool_calls,
audio=choice["message"].get("audio", None),
)
finish_reason = choice.get("finish_reason", None)
if finish_reason is None:
# gpt-4 vision can return 'finish_reason' or 'finish_details'
finish_reason = choice.get("finish_details") or "stop"
logprobs = choice.get("logprobs", None)
enhancements = choice.get("enhancements", None)
choice = Choices(
finish_reason=finish_reason,
index=idx,
message=message,
logprobs=logprobs,
enhancements=enhancements,
)
choice_list.append(choice)
model_response_object.choices = choice_list
if "usage" in response_object and response_object["usage"] is not None:
usage_object = litellm.Usage(**response_object["usage"])
setattr(model_response_object, "usage", usage_object)
if "created" in response_object:
model_response_object.created = response_object["created"] or int(
time.time()
)
if "id" in response_object:
model_response_object.id = response_object["id"] or str(uuid.uuid4())
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object[
"system_fingerprint"
]
if "model" in response_object:
if model_response_object.model is None:
model_response_object.model = response_object["model"]
elif (
"/" in model_response_object.model
and response_object["model"] is not None
):
openai_compatible_provider = model_response_object.model.split("/")[
0
]
model_response_object.model = (
openai_compatible_provider + "/" + response_object["model"]
)
if start_time is not None and end_time is not None:
if isinstance(start_time, type(end_time)):
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000
if hidden_params is not None:
if model_response_object._hidden_params is None:
model_response_object._hidden_params = {}
model_response_object._hidden_params.update(hidden_params)
if _response_headers is not None:
model_response_object._response_headers = _response_headers
special_keys = list(litellm.ModelResponse.model_fields.keys())
special_keys.append("usage")
for k, v in response_object.items():
if k not in special_keys:
setattr(model_response_object, k, v)
return model_response_object
elif response_type == "embedding" and (
model_response_object is None
or isinstance(model_response_object, EmbeddingResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = EmbeddingResponse()
if "model" in response_object:
model_response_object.model = response_object["model"]
if "object" in response_object:
model_response_object.object = response_object["object"]
model_response_object.data = response_object["data"]
if "usage" in response_object and response_object["usage"] is not None:
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if start_time is not None and end_time is not None:
model_response_object._response_ms = ( # type: ignore
end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
if _response_headers is not None:
model_response_object._response_headers = _response_headers
return model_response_object
elif response_type == "image_generation" and (
model_response_object is None
or isinstance(model_response_object, ImageResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = ImageResponse()
if "created" in response_object:
model_response_object.created = response_object["created"]
if "data" in response_object:
model_response_object.data = response_object["data"]
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
return model_response_object
elif response_type == "audio_transcription" and (
model_response_object is None
or isinstance(model_response_object, TranscriptionResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = TranscriptionResponse()
if "text" in response_object:
model_response_object.text = response_object["text"]
optional_keys = ["language", "task", "duration", "words", "segments"]
for key in optional_keys: # not guaranteed to be in response
if key in response_object:
setattr(model_response_object, key, response_object[key])
if hidden_params is not None:
model_response_object._hidden_params = hidden_params
if _response_headers is not None:
model_response_object._response_headers = _response_headers
return model_response_object
elif response_type == "rerank" and (
model_response_object is None
or isinstance(model_response_object, RerankResponse)
):
if response_object is None:
raise Exception("Error in response object format")
if model_response_object is None:
model_response_object = RerankResponse(**response_object)
return model_response_object
if "id" in response_object:
model_response_object.id = response_object["id"]
if "meta" in response_object:
model_response_object.meta = response_object["meta"]
if "results" in response_object:
model_response_object.results = response_object["results"]
return model_response_object
except Exception:
raise Exception(
f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
)
def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call
return litellm.acompletion(*args, **kwargs)
View file
@ -1104,7 +1104,7 @@
"litellm_provider": "azure_ai",
"mode": "chat"
},
"azure_ai/Meta-Llama-31-8B-Instruct": {
"azure_ai/Meta-Llama-3.1-8B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1114,7 +1114,7 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-70B-Instruct": {
"azure_ai/Meta-Llama-3.1-70B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1124,7 +1124,7 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-405B-Instruct": {
"azure_ai/Meta-Llama-3.1-405B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
@ -1751,6 +1751,22 @@
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"claude-3-5-sonnet-20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159,
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"text-bison": {
"max_tokens": 2048,
"max_input_tokens": 8192,
@ -2578,6 +2594,18 @@
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-5-sonnet-v2@20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"supports_assistant_prefill": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -3336,54 +3364,56 @@
"litellm_provider": "cohere",
"mode": "rerank"
},
"embed-english-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-light-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-multilingual-v3.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-v2.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 4096,
"max_input_tokens": 4096,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-light-v2.0": {
"max_tokens": 512,
"max_input_tokens": 512,
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-multilingual-v2.0": {
"max_tokens": 256,
"max_input_tokens": 256,
"max_tokens": 768,
"max_input_tokens": 768,
"input_cost_per_token": 0.00000010,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding"
},
"embed-english-v3.0": {
"max_tokens": 1024,
"max_input_tokens": 1024,
"input_cost_per_token": 0.00000010,
"input_cost_per_image": 0.0001,
"output_cost_per_token": 0.00000,
"litellm_provider": "cohere",
"mode": "embedding",
"supports_image_input": true
},
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
@ -3572,6 +3602,22 @@
"supports_vision": true,
"tool_use_system_prompt_tokens": 264
},
"anthropic/claude-3-5-sonnet-20241022": {
"max_tokens": 8192,
"max_input_tokens": 200000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"cache_creation_input_token_cost": 0.00000375,
"cache_read_input_token_cost": 0.0000003,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159,
"supports_assistant_prefill": true,
"supports_prompt_caching": true
},
"openrouter/anthropic/claude-3.5-sonnet": {
"max_tokens": 8192,
"max_input_tokens": 200000,
@ -4093,6 +4139,18 @@
"litellm_provider": "bedrock",
"mode": "embedding"
},
"amazon.titan-embed-image-v1": {
"max_tokens": 128,
"max_input_tokens": 128,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000008,
"input_cost_per_image": 0.00006,
"output_cost_per_token": 0.0,
"litellm_provider": "bedrock",
"supports_image_input": true,
"mode": "embedding",
"source": "https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=amazon.titan-image-generator-v1"
},
"mistral.mistral-7b-instruct-v0:2": {
"max_tokens": 8191,
"max_input_tokens": 32000,
@ -4246,6 +4304,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -4290,6 +4359,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"us.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -4334,6 +4414,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-5-sonnet-20241022-v2:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"eu.anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -6369,6 +6460,14 @@
"litellm_provider": "voyage",
"mode": "embedding"
},
"voyage/voyage-finance-2": {
"max_tokens": 4000,
"max_input_tokens": 4000,
"input_cost_per_token": 0.00000012,
"output_cost_per_token": 0.000000,
"litellm_provider": "voyage",
"mode": "embedding"
},
"databricks/databricks-meta-llama-3-1-405b-instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
View file
@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.50.2"
version = "1.51.0"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.50.2"
version = "1.51.0"
version_files = [
"pyproject.toml:^version"
]
View file
@ -154,6 +154,8 @@ model LiteLLM_VerificationToken {
model_spend Json @default("{}")
model_max_budget Json @default("{}")
budget_id String?
created_at DateTime? @default(now()) @map("created_at")
updated_at DateTime? @default(now()) @updatedAt @map("updated_at")
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
}
View file
@ -695,3 +695,41 @@ def test_convert_to_model_response_object_error():
_response_headers=None,
convert_tool_call_to_json_mode=False,
)
def test_image_generation_openai_with_pydantic_warning(caplog):
try:
import logging
from litellm.types.utils import ImageResponse, ImageObject
convert_response_args = {
"response_object": {
"created": 1729709945,
"data": [
{
"b64_json": None,
"revised_prompt": "Generate an image of a baby sea otter. It should look incredibly cute, with big, soulful eyes and a fluffy, wet fur coat. The sea otter should be on its back, as sea otters often do, with its tiny hands holding onto a shell as if it is its precious toy. The background should be a tranquil sea under a clear sky, with soft sunlight reflecting off the waters. The color palette should be soothing with blues, browns, and white.",
"url": "https://oaidalleapiprodscus.blob.core.windows.net/private/org-ikDc4ex8NB5ZzfTf8m5WYVB7/user-JpwZsbIXubBZvan3Y3GchiiB/img-LL0uoOv4CFJIvNYxoNCKB8oc.png?st=2024-10-23T17%3A59%3A05Z&se=2024-10-23T19%3A59%3A05Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=d505667d-d6c1-4a0a-bac7-5c84a87759f8&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-10-22T19%3A26%3A22Z&ske=2024-10-23T19%3A26%3A22Z&sks=b&skv=2024-08-04&sig=Hl4wczJ3H2vZNdLRt/7JvNi6NvQGDnbNkDy15%2Bl3k5s%3D",
}
],
},
"model_response_object": ImageResponse(
created=1729709929,
data=[],
),
"response_type": "image_generation",
"stream": False,
"start_time": None,
"end_time": None,
"hidden_params": None,
"_response_headers": None,
"convert_tool_call_to_json_mode": None,
}
resp: ImageResponse = convert_to_model_response_object(**convert_response_args)
assert resp is not None
assert resp.data is not None
assert len(resp.data) == 1
assert isinstance(resp.data[0], ImageObject)
except Exception as e:
pytest.fail(f"Test failed with exception: {e}")
View file
@ -235,7 +235,7 @@ def test_all_model_configs():
optional_params={},
api_version="2022-12-01",
drop_params=False,
) == {"max_tokens": 10}
) == {"max_completion_tokens": 10}
from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig

View file

@ -775,3 +775,12 @@ def test_hosted_vllm_tool_param():
)
assert "tools" not in optional_params
assert "tool_choice" not in optional_params
def test_unmapped_vertex_anthropic_model():
optional_params = get_optional_params(
model="claude-3-5-sonnet-v250@20241022",
custom_llm_provider="vertex_ai",
max_retries=10,
)
assert "max_retries" not in optional_params
View file
@ -0,0 +1,64 @@
import json
import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from litellm.types.utils import TextCompletionResponse
def test_convert_dict_to_text_completion_response():
input_dict = {
"id": "cmpl-ALVLPJgRkqpTomotoOMi3j0cAaL4L",
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": {
"text_offset": [0, 5],
"token_logprobs": [None, -12.203847],
"tokens": ["hello", " crisp"],
"top_logprobs": [None, {",": -2.1568563}],
},
"text": "hello crisp",
}
],
"created": 1729688739,
"model": "davinci-002",
"object": "text_completion",
"system_fingerprint": None,
"usage": {
"completion_tokens": 1,
"prompt_tokens": 1,
"total_tokens": 2,
"completion_tokens_details": None,
"prompt_tokens_details": None,
},
}
response = TextCompletionResponse(**input_dict)
assert response.id == "cmpl-ALVLPJgRkqpTomotoOMi3j0cAaL4L"
assert len(response.choices) == 1
assert response.choices[0].finish_reason == "length"
assert response.choices[0].index == 0
assert response.choices[0].text == "hello crisp"
assert response.created == 1729688739
assert response.model == "davinci-002"
assert response.object == "text_completion"
assert response.system_fingerprint is None
assert response.usage.completion_tokens == 1
assert response.usage.prompt_tokens == 1
assert response.usage.total_tokens == 2
assert response.usage.completion_tokens_details is None
assert response.usage.prompt_tokens_details is None
# Test logprobs
assert response.choices[0].logprobs.text_offset == [0, 5]
assert response.choices[0].logprobs.token_logprobs == [None, -12.203847]
assert response.choices[0].logprobs.tokens == ["hello", " crisp"]
assert response.choices[0].logprobs.top_logprobs == [None, {",": -2.1568563}]
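
The dictionary above mirrors the shape of an OpenAI text-completion payload, and the test confirms TextCompletionResponse can be hydrated from it directly. For context, a hedged sketch of producing a comparable object through litellm itself (this performs a real API call and requires OPENAI_API_KEY; the model choice and logprobs value are assumptions, not part of the test):

import litellm

resp = litellm.text_completion(
    model="davinci-002",
    prompt="hello",
    max_tokens=1,
    logprobs=1,
)
print(resp.choices[0].text, resp.usage.total_tokens)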

View file

@@ -428,11 +428,16 @@ async def test_aaalangfuse_logging_metadata(langfuse_client):
await asyncio.sleep(2)
langfuse_client.flush()
# await asyncio.sleep(10)
await asyncio.sleep(4)
# Tests the metadata filtering and the override of the output to be the last generation
for trace_id, generation_ids in trace_identifiers.items():
trace = langfuse_client.get_trace(id=trace_id)
try:
trace = langfuse_client.get_trace(id=trace_id)
except Exception as e:
if "Trace not found within authorized project" in str(e):
print(f"Trace {trace_id} not found")
continue
assert trace.id == trace_id
assert trace.session_id == session_id
assert trace.metadata != trace_metadata
@@ -620,7 +625,7 @@ def test_aaalangfuse_existing_trace_id():
import datetime
import litellm
from litellm.integrations.langfuse import LangFuseLogger
from litellm.integrations.langfuse.langfuse import LangFuseLogger
langfuse_Logger = LangFuseLogger(
langfuse_public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
@@ -1120,7 +1125,7 @@ generation_params = {
)
def test_langfuse_prompt_type(prompt):
from litellm.integrations.langfuse import _add_prompt_to_generation_params
from litellm.integrations.langfuse.langfuse import _add_prompt_to_generation_params
clean_metadata = {
"prompt": {
@@ -1227,7 +1232,7 @@ def test_langfuse_prompt_type(prompt):
def test_langfuse_logging_metadata():
from litellm.integrations.langfuse import log_requester_metadata
from litellm.integrations.langfuse.langfuse import log_requester_metadata
metadata = {"key": "value", "requester_metadata": {"key": "value"}}

View file

@@ -10,9 +10,9 @@ from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanE
import litellm
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from litellm.integrations.arize_ai import ArizeConfig, ArizeLogger
load_dotenv()
import logging
@pytest.mark.asyncio()
@@ -32,3 +32,57 @@ async def test_async_otel_callback():
)
await asyncio.sleep(2)
@pytest.fixture
def mock_env_vars(monkeypatch):
monkeypatch.setenv("ARIZE_SPACE_KEY", "test_space_key")
monkeypatch.setenv("ARIZE_API_KEY", "test_api_key")
def test_get_arize_config(mock_env_vars):
"""
Use Arize default endpoint when no endpoints are provided
"""
config = ArizeLogger._get_arize_config()
assert isinstance(config, ArizeConfig)
assert config.space_key == "test_space_key"
assert config.api_key == "test_api_key"
assert config.grpc_endpoint == "https://otlp.arize.com/v1"
assert config.http_endpoint is None
def test_get_arize_config_with_endpoints(mock_env_vars, monkeypatch):
"""
Use provided endpoints when they are set
"""
monkeypatch.setenv("ARIZE_ENDPOINT", "grpc://test.endpoint")
monkeypatch.setenv("ARIZE_HTTP_ENDPOINT", "http://test.endpoint")
config = ArizeLogger._get_arize_config()
assert config.grpc_endpoint == "grpc://test.endpoint"
assert config.http_endpoint == "http://test.endpoint"
def test_get_arize_opentelemetry_config_grpc(mock_env_vars, monkeypatch):
"""
Use provided GRPC endpoint when it is set
"""
monkeypatch.setenv("ARIZE_ENDPOINT", "grpc://test.endpoint")
config = ArizeLogger.get_arize_opentelemetry_config()
assert isinstance(config, OpenTelemetryConfig)
assert config.exporter == "otlp_grpc"
assert config.endpoint == "grpc://test.endpoint"
def test_get_arize_opentelemetry_config_http(mock_env_vars, monkeypatch):
"""
Use provided HTTP endpoint when it is set
"""
monkeypatch.setenv("ARIZE_HTTP_ENDPOINT", "http://test.endpoint")
config = ArizeLogger.get_arize_opentelemetry_config()
assert isinstance(config, OpenTelemetryConfig)
assert config.exporter == "otlp_http"
assert config.endpoint == "http://test.endpoint"

View file

@@ -0,0 +1,152 @@
import os
import sys
import traceback
import uuid
from datetime import datetime
from dotenv import load_dotenv
from fastapi import Request
from fastapi.routing import APIRoute
import io
import os
import time
# this file tests litellm/proxy
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
import logging
load_dotenv()
import pytest
import uuid
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy.proxy_server import (
LitellmUserRoles,
audio_transcriptions,
chat_completion,
completion,
embeddings,
image_generation,
model_list,
moderations,
new_end_user,
user_api_key_auth,
)
from litellm.proxy.utils import PrismaClient, ProxyLogging, hash_token, update_spend
verbose_proxy_logger.setLevel(level=logging.DEBUG)
from starlette.datastructures import URL
from litellm.proxy.management_helpers.audit_logs import create_audit_log_for_update
from litellm.proxy._types import LiteLLM_AuditLogs, LitellmTableNames
from litellm.caching.caching import DualCache
from unittest.mock import patch, AsyncMock
proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())
import json
@pytest.mark.asyncio
async def test_create_audit_log_for_update_premium_user():
"""
Basic unit test for create_audit_log_for_update
Test that the audit log is created when a premium user updates a team
"""
with patch("litellm.proxy.proxy_server.premium_user", True), patch(
"litellm.store_audit_logs", True
), patch("litellm.proxy.proxy_server.prisma_client") as mock_prisma:
mock_prisma.db.litellm_auditlog.create = AsyncMock()
request_data = LiteLLM_AuditLogs(
id="test_id",
updated_at=datetime.now(),
changed_by="test_changed_by",
action="updated",
table_name=LitellmTableNames.TEAM_TABLE_NAME,
object_id="test_object_id",
updated_values=json.dumps({"key": "value"}),
before_value=json.dumps({"old_key": "old_value"}),
)
await create_audit_log_for_update(request_data)
mock_prisma.db.litellm_auditlog.create.assert_called_once_with(
data={
"id": "test_id",
"updated_at": request_data.updated_at,
"changed_by": request_data.changed_by,
"action": request_data.action,
"table_name": request_data.table_name,
"object_id": request_data.object_id,
"updated_values": request_data.updated_values,
"before_value": request_data.before_value,
}
)
@pytest.fixture
def prisma_client():
from litellm.proxy.proxy_cli import append_query_params
### add connection pool + pool timeout args
params = {"connection_limit": 100, "pool_timeout": 60}
database_url = os.getenv("DATABASE_URL")
modified_url = append_query_params(database_url, params)
os.environ["DATABASE_URL"] = modified_url
# Assuming PrismaClient is a class that needs to be instantiated
prisma_client = PrismaClient(
database_url=os.environ["DATABASE_URL"], proxy_logging_obj=proxy_logging_obj
)
return prisma_client
@pytest.mark.asyncio()
async def test_create_audit_log_in_db(prisma_client):
print("prisma client=", prisma_client)
setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "premium_user", True)
setattr(litellm, "store_audit_logs", True)
await litellm.proxy.proxy_server.prisma_client.connect()
audit_log_id = f"audit_log_id_{uuid.uuid4()}"
# create an audit log entry for a team update
request_data = LiteLLM_AuditLogs(
id=audit_log_id,
updated_at=datetime.now(),
changed_by="test_changed_by",
action="updated",
table_name=LitellmTableNames.TEAM_TABLE_NAME,
object_id="test_object_id",
updated_values=json.dumps({"key": "value"}),
before_value=json.dumps({"old_key": "old_value"}),
)
await create_audit_log_for_update(request_data)
await asyncio.sleep(1)
# now read the last log from the db
last_log = await prisma_client.db.litellm_auditlog.find_first(
where={"id": audit_log_id}
)
assert last_log.id == audit_log_id
setattr(litellm, "store_audit_logs", False)

View file

@@ -2066,46 +2066,6 @@ async def test_cache_default_off_acompletion():
assert response3.id == response4.id
@pytest.mark.asyncio()
@pytest.mark.skip(reason="dual caching should first prioritze local cache")
async def test_dual_cache_uses_redis():
"""
- Store diff values in redis and in memory cache
- call get cache
- Assert that value from redis is used
"""
litellm.set_verbose = True
from litellm.caching.caching import DualCache, RedisCache
current_usage = uuid.uuid4()
_cache_obj = DualCache(redis_cache=RedisCache(), always_read_redis=True)
# set cache
await _cache_obj.async_set_cache(key=f"current_usage: {current_usage}", value=10)
# modify value of in memory cache
_cache_obj.in_memory_cache.cache_dict[f"current_usage: {current_usage}"] = 1
# get cache
value = await _cache_obj.async_get_cache(key=f"current_usage: {current_usage}")
print("value from dual cache", value)
assert value == 10
@pytest.mark.asyncio()
async def test_proxy_logging_setup():
"""
Assert always_read_redis is True when used by internal usage cache
"""
from litellm.caching.caching import DualCache
from litellm.proxy.utils import ProxyLogging
pl_obj = ProxyLogging(user_api_key_cache=DualCache())
assert pl_obj.internal_usage_cache.dual_cache.always_read_redis is True
@pytest.mark.skip(reason="local test. Requires sentinel setup.")
@pytest.mark.asyncio
async def test_redis_sentinel_caching():

View file

@@ -1,42 +0,0 @@
import sys
import os
import io, asyncio
# import logging
# logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, os.path.abspath("../.."))
print("Modified sys.path:", sys.path)
from litellm import completion
import litellm
from litellm._logging import verbose_logger
import logging
litellm.num_retries = 3
import time, random
import pytest
@pytest.mark.asyncio
@pytest.mark.skip(reason="beta test - this is a new feature")
async def test_custom_api_logging():
try:
litellm.success_callback = ["clickhouse"]
litellm.set_verbose = True
verbose_logger.setLevel(logging.DEBUG)
await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": f"This is a test"}],
max_tokens=10,
temperature=0.7,
user="ishaan-2",
)
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
finally:
# post, close log file and verify
# Reset stdout to the original value
print("Passed!")

Some files were not shown because too many files have changed in this diff.