Merge branch 'main' into litellm_call_id_in_response

Krish Dholakia 2024-07-11 21:54:49 -07:00 committed by GitHub
commit 72f1c9181d
119 changed files with 4737 additions and 1868 deletions

View file

@ -243,7 +243,102 @@ jobs:
command: |
pwd
ls
python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests
no_output_timeout: 120m
# Store test results
- store_test_results:
path: test-results
proxy_log_to_otel_tests:
machine:
image: ubuntu-2204:2023.10.1
resource_class: xlarge
working_directory: ~/project
steps:
- checkout
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
pip install openai
python -m pip install --upgrade pip
python -m pip install -r .circleci/requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-mock==3.12.0"
pip install "pytest-asyncio==0.21.1"
pip install mypy
pip install pyarrow
pip install numpydoc
pip install prisma
pip install fastapi
pip install jsonschema
pip install "httpx==0.24.1"
pip install "anyio==3.7.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
- run:
name: Build Docker image
command: docker build -t my-app:latest -f Dockerfile.database .
- run:
name: Run Docker container
# intentionally give bad redis credentials here
# the OTEL test - should get this as a trace
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e LITELLM_MASTER_KEY="sk-1234" \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
-e OTEL_EXPORTER="in_memory" \
--name my-app \
-v $(pwd)/litellm/proxy/example_config_yaml/otel_test_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug
- run:
name: Install curl and dockerize
command: |
sudo apt-get update
sudo apt-get install -y curl
sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
command: docker logs -f my-app
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/otel_tests/test_otel.py -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
# Store test results
@ -337,6 +432,12 @@ workflows:
only:
- main
- /litellm_.*/
- proxy_log_to_otel_tests:
filters:
branches:
only:
- main
- /litellm_.*/
- installing_litellm_on_python:
filters:
branches:
@ -347,6 +448,7 @@ workflows:
requires:
- local_testing
- build_and_test
- proxy_log_to_otel_tests
filters:
branches:
only:

View file

@ -1,88 +0,0 @@
apiVersion: v1
entries:
postgresql:
- annotations:
category: Database
images: |
- name: os-shell
image: docker.io/bitnami/os-shell:12-debian-12-r16
- name: postgres-exporter
image: docker.io/bitnami/postgres-exporter:0.15.0-debian-12-r14
- name: postgresql
image: docker.io/bitnami/postgresql:16.2.0-debian-12-r6
licenses: Apache-2.0
apiVersion: v2
appVersion: 16.2.0
created: "2024-07-08T11:05:19.312515+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
tags:
- bitnami-common
version: 2.x.x
description: PostgreSQL (Postgres) is an open source object-relational database
known for reliability and data integrity. ACID-compliant, it supports foreign
keys, joins, views, triggers and stored procedures.
digest: 3c8125526b06833df32e2f626db34aeaedb29d38f03d15349db6604027d4a167
home: https://bitnami.com
icon: https://bitnami.com/assets/stacks/postgresql/img/postgresql-stack-220x234.png
keywords:
- postgresql
- postgres
- database
- sql
- replication
- cluster
maintainers:
- name: VMware, Inc.
url: https://github.com/bitnami/charts
name: postgresql
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/postgresql
urls:
- https://berriai.github.io/litellm/charts/postgresql-14.3.1.tgz
version: 14.3.1
redis:
- annotations:
category: Database
images: |
- name: kubectl
image: docker.io/bitnami/kubectl:1.29.2-debian-12-r3
- name: os-shell
image: docker.io/bitnami/os-shell:12-debian-12-r16
- name: redis
image: docker.io/bitnami/redis:7.2.4-debian-12-r9
- name: redis-exporter
image: docker.io/bitnami/redis-exporter:1.58.0-debian-12-r4
- name: redis-sentinel
image: docker.io/bitnami/redis-sentinel:7.2.4-debian-12-r7
licenses: Apache-2.0
apiVersion: v2
appVersion: 7.2.4
created: "2024-07-08T11:05:19.317065+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
tags:
- bitnami-common
version: 2.x.x
description: Redis(R) is an open source, advanced key-value store. It is often
referred to as a data structure server since keys can contain strings, hashes,
lists, sets and sorted sets.
digest: b2fa1835f673a18002ca864c54fadac3c33789b26f6c5e58e2851b0b14a8f984
home: https://bitnami.com
icon: https://bitnami.com/assets/stacks/redis/img/redis-stack-220x234.png
keywords:
- redis
- keyvalue
- database
maintainers:
- name: VMware, Inc.
url: https://github.com/bitnami/charts
name: redis
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/redis
urls:
- https://berriai.github.io/litellm/charts/redis-18.19.1.tgz
version: 18.19.1
generated: "2024-07-08T11:05:19.308028+08:00"

View file

@ -0,0 +1,54 @@
# [BETA] Anthropic `/v1/messages`
Call 100+ LLMs in the Anthropic format.
1. Setup config.yaml
```yaml
model_list:
- model_name: my-test-model
litellm_params:
model: gpt-3.5-turbo
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'x-api-key: sk-1234' \
-H 'content-type: application/json' \
-d '{
"model": "my-test-model",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
## Test with Anthropic SDK
```python
import os
from anthropic import Anthropic
client = Anthropic(api_key="sk-1234", base_url="http://0.0.0.0:4000") # 👈 CONNECT TO PROXY
message = client.messages.create(
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="my-test-model", # 👈 set 'model_name'
)
print(message.content)
```

View file

@ -26,6 +26,7 @@ Call an existing Assistant.
- Run the Assistant on the Thread to generate a response by calling the model and the tools.
### SDK + PROXY
<Tabs>
<TabItem value="sdk" label="SDK">
@ -281,3 +282,31 @@ curl -X POST 'http://0.0.0.0:4000/threads/{thread_id}/runs' \
</Tabs>
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/assistants)
## OpenAI-Compatible APIs
To call OpenAI-compatible Assistants APIs (e.g. the Astra Assistants API), just add `openai/` to the model name:
**config**
```yaml
assistant_settings:
custom_llm_provider: openai
litellm_params:
api_key: os.environ/ASTRA_API_KEY
api_base: os.environ/ASTRA_API_BASE
```
**curl**
```bash
curl -X POST "http://localhost:4000/v1/assistants" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"instructions": "You are a personal math tutor. When asked a question, write and run Python code to answer the question.",
"name": "Math Tutor",
"tools": [{"type": "code_interpreter"}],
"model": "openai/<my-astra-model-name>"
}'
```

View file

@ -0,0 +1,34 @@
# Data Privacy and Security
## Security Measures
### LiteLLM Cloud
- We encrypt all data stored using your `LITELLM_MASTER_KEY` and in transit using TLS.
- Our database and application run on GCP, AWS infrastructure, partly managed by NeonDB.
- US data region: Northern California (AWS/GCP `us-west-1`) & Virginia (AWS `us-east-1`)
- EU data region: Germany/Frankfurt (AWS/GCP `eu-central-1`)
- All users have access to SSO (Single Sign-On) through OAuth 2.0 with Google, Okta, Microsoft, KeyCloak.
- Audit Logs with retention policy
- Control Allowed IP Addresses that can access your Cloud LiteLLM Instance
For security inquiries, please contact us at support@berri.ai
### Supported data regions for LiteLLM Cloud
LiteLLM supports the following data regions:
- US, Northern California (AWS/GCP `us-west-1`)
- Europe, Frankfurt, Germany (AWS/GCP `eu-central-1`)
All data, user accounts, and infrastructure are completely separated between these two regions.
### Security Vulnerability Reporting Guidelines
We value the security community's role in protecting our systems and users. To report a security vulnerability:
- Email support@berri.ai with details
- Include steps to reproduce the issue
- Provide any relevant additional information
We'll review all reports promptly. Note that we don't currently offer a bug bounty program.

View file

@ -24,6 +24,7 @@ This covers:
- ✅ [JWT-Auth](../docs/proxy/token_auth.md)
- ✅ [Control available public, private routes](./proxy/enterprise#control-available-public-private-routes)
- ✅ [[BETA] AWS Key Manager v2 - Key Decryption](./proxy/enterprise#beta-aws-key-manager---key-decryption)
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests)

View file

@ -21,6 +21,14 @@ See our status page for [**live reliability**](https://status.litellm.ai/)
- **Reliable**: Our hosted proxy is tested on 1k requests per second, making it reliable for high load.
- **Secure**: LiteLLM is currently undergoing SOC-2 compliance, to make sure your data is as secure as possible.
## Data Privacy & Security
You can find our [data privacy & security policy for cloud litellm here](../docs/data_security#litellm-cloud)
## Supported data regions for LiteLLM Cloud
You can find the [supported data regions for LiteLLM Cloud here](../docs/data_security#supported-data-regions-for-litellm-cloud)
### Pricing
Pricing is based on usage. We can figure out a price that works for your team, on the call.

View file

@ -18,6 +18,7 @@ Features:
- ✅ [JWT-Auth](../docs/proxy/token_auth.md)
- ✅ [Control available public, private routes](#control-available-public-private-routes)
- ✅ [[BETA] AWS Key Manager v2 - Key Decryption](#beta-aws-key-manager---key-decryption)
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests)

View file

@ -112,37 +112,52 @@ model_list:
mode: completion # 👈 ADD THIS
```
### Speech to Text Models
```yaml
model_list:
- model_name: whisper
litellm_params:
model: whisper-1
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription
```
## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests
Example Request:
```bash
curl http://0.0.0.0:4000/health/readiness
```
Example Response:
*If proxy connected to a database*
```json
{
    "status": "connected",
    "db": "connected",
    "cache": null,
    "litellm_version": "1.40.21",
    "success_callbacks": [
        "langfuse",
        "_PROXY_track_cost_callback",
        "response_taking_too_long_callback",
        "_PROXY_MaxParallelRequestsHandler",
        "_PROXY_MaxBudgetLimiter",
        "_PROXY_CacheControlCheck",
        "ServiceLogging"
    ],
    "last_updated": "2024-07-10T18:59:10.616968"
}
```
*If proxy not connected to a database*
```json
{
"status": "healthy",
"db": "Not connected",
"litellm_version":"1.19.2",
}
```
If the proxy is not connected to a database, the `"db"` field will be `"Not connected"` instead of `"connected"`, and the `"last_updated"` field will not be present.
## `/health/liveliness`

View file

@ -1,27 +1,19 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# 🪢 Logging - Langfuse, OpenTelemetry, Custom Callbacks, DataDog, s3 Bucket, Sentry, Athina, Azure Content-Safety

Log Proxy input, output, and exceptions using Langfuse, OpenTelemetry, Custom Callbacks, DataDog, DynamoDB, s3 Bucket, and more.
## Table of Contents
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
- [Logging with OpenTelemetry](#logging-proxy-inputoutput-in-opentelemetry-format)
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to Galileo](#logging-llm-io-to-galileo)
- [Logging to OpenMeter](#logging-proxy-cost--usage---openmeter)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
- [Logging to Sentry](#logging-proxy-inputoutput---sentry)
- [Logging to Athina](#logging-proxy-inputoutput-athina)
- [(BETA) Moderation with Azure Content-Safety](#moderation-with-azure-content-safety)
## Getting the LiteLLM Call ID
LiteLLM generates a unique `call_id` for each request. This `call_id` can be
@ -56,6 +48,7 @@ A number of these headers could be useful for troubleshooting, but the
components in your system, including in logging tools.
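For example, here is a minimal sketch of reading the call id off a proxy response with the OpenAI Python SDK. It assumes the id is surfaced in an `x-litellm-call-id` response header and that the proxy runs locally with master key `sk-1234`; adjust both for your deployment.

```python
# Hedged sketch: the "x-litellm-call-id" header name and the proxy URL/key are assumptions.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
)
print(raw.headers.get("x-litellm-call-id"))  # use this id to correlate the request across logging tools
response = raw.parse()  # the usual ChatCompletion object
print(response.choices[0].message.content)
```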
## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]`. This will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment.
**Step 1** Install langfuse
@ -65,6 +58,7 @@ pip install langfuse>=2.0.0
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -75,6 +69,7 @@ litellm_settings:
```
**Step 3**: Set required env variables for logging to langfuse
```shell
export LANGFUSE_PUBLIC_KEY="pk_kk"
export LANGFUSE_SECRET_KEY="sk_ss"
@ -85,11 +80,13 @@ export LANGFUSE_HOST="https://xxx.langfuse.com"
**Step 4**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
litellm --test
```
@ -100,7 +97,6 @@ Expected output on Langfuse
### Logging Metadata to Langfuse
<Tabs>
<TabItem value="Curl" label="Curl Request">
@ -126,6 +122,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}
}'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
@ -159,6 +156,7 @@ response = client.chat.completions.create(
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
@ -201,7 +199,6 @@ print(response)
</TabItem>
</Tabs>
### Team based Logging to Langfuse
**Example:**
@ -290,6 +287,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}
}'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
@ -320,6 +318,7 @@ response = client.chat.completions.create(
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
@ -365,7 +364,6 @@ You will see `raw_request` in your Langfuse Metadata. This is the RAW CURL comma
<Image img={require('../../img/debug_langfuse.png')} />
## Logging Proxy Input/Output in OpenTelemetry format
:::info
@ -381,10 +379,8 @@ OTEL_SERVICE_NAME=<your-service-name>` # default="litellm"
<Tabs>
<TabItem value="Console Exporter" label="Log to console">
**Step 1:** Set callbacks and env vars
Add the following to your env
@ -400,7 +396,6 @@ litellm_settings:
callbacks: ["otel"]
```
**Step 2**: Start the proxy, make a test request
Start proxy
@ -460,7 +455,6 @@ This is the Span from OTEL Logging
</TabItem>
<TabItem value="Honeycomb" label="Log to Honeycomb">
#### Quick Start - Log to Honeycomb
@ -482,7 +476,6 @@ litellm_settings:
callbacks: ["otel"]
```
**Step 2**: Start the proxy, make a test request
Start proxy
@ -507,10 +500,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
</TabItem>
<TabItem value="otel-col" label="Log to OTEL HTTP Collector">
#### Quick Start - Log to OTEL Collector
@ -532,7 +523,6 @@ litellm_settings:
callbacks: ["otel"]
```
**Step 2**: Start the proxy, make a test request
Start proxy
@ -559,7 +549,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
<TabItem value="otel-col-grpc" label="Log to OTEL GRPC Collector">
#### Quick Start - Log to OTEL GRPC Collector
@ -581,7 +570,6 @@ litellm_settings:
callbacks: ["otel"]
```
**Step 2**: Start the proxy, make a test request
Start proxy
@ -606,7 +594,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
</TabItem>
<TabItem value="traceloop" label="Log to Traceloop Cloud">
@ -629,7 +616,6 @@ environment_variables:
TRACELOOP_API_KEY: "XXXXX"
```
**Step 3**: Start the proxy, make a test request
Start proxy
@ -665,11 +651,15 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
❓ Use this when you want to **pass information about the incoming request in a distributed tracing system**
✅ Key change: Pass the **`traceparent` header** in your requests. [Read more about traceparent headers here](https://uptrace.dev/opentelemetry/opentelemetry-traceparent.html#what-is-traceparent-header)
```curl
traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01
```
Example Usage
1. Make Request to LiteLLM Proxy with `traceparent` header
```python
import openai
import uuid
@ -693,7 +683,6 @@ response = client.chat.completions.create(
)
print(response)
```
```shell
@ -707,12 +696,12 @@ Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector
<Image img={require('../../img/otel_parent.png')} />
## Custom Callback Class [Async]
Use this when you want to run custom callbacks in `python`
#### Step 1 - Create your custom `litellm` callback class
We use `litellm.integrations.custom_logger` for this, **more details about litellm custom callbacks [here](https://docs.litellm.ai/docs/observability/custom_callback)**
Define your custom callback class in a python file.
@ -815,16 +804,17 @@ proxy_handler_instance = MyCustomHandler()
```
#### Step 2 - Pass your custom callback class in `config.yaml`
We pass the custom callback class defined in **Step 1** to the `config.yaml`.
Set `callbacks` to `python_filename.logger_instance_name`
In the config below, we pass
- python_filename: `custom_callbacks.py`
- logger_instance_name: `proxy_handler_instance`. This is defined in Step 1
`callbacks: custom_callbacks.proxy_handler_instance`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -837,6 +827,7 @@ litellm_settings:
```
#### Step 3 - Start proxy + test request
```shell
litellm --config proxy_config.yaml
```
@ -858,6 +849,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
```
#### Resulting Log on Proxy
```shell
On Success
Model: gpt-3.5-turbo,
@ -910,7 +902,6 @@ class MyCustomHandler(CustomLogger):
"max_tokens": 10
}
}
```
#### Logging `model_info` set in config.yaml
@ -928,11 +919,13 @@ class MyCustomHandler(CustomLogger):
```
**Expected Output**
```json
{'mode': 'embedding', 'input_cost_per_token': 0.002}
```
### Logging responses from proxy
Both `/chat/completions` and `/embeddings` responses are available as `response_obj`
**Note: for `/chat/completions`, both `stream=True` and non-stream responses are available as `response_obj`**
@ -946,6 +939,7 @@ class MyCustomHandler(CustomLogger):
```
**Expected Output /chat/completion [for both `stream` and `non-stream` responses]**
```json
ModelResponse(
id='chatcmpl-8Tfu8GoMElwOZuj2JlHBhNHG01PPo',
@ -972,6 +966,7 @@ ModelResponse(
```
**Expected Output /embeddings**
```json
{
'model': 'ada',
@ -991,7 +986,6 @@ ModelResponse(
}
```
## Custom Callback APIs [Async]
:::info
@ -1001,10 +995,12 @@ This is an Enterprise only feature [Get Started with Enterprise here](https://gi
:::
Use this if you:
- Want to use custom callbacks written in a non-Python programming language
- Want your callbacks to run on a different microservice
#### Step 1. Create your generic logging API endpoint
Set up a generic API endpoint that can receive data in JSON format. The data will be included within a "data" field.
Your server should support the following Request format:
@ -1067,11 +1063,8 @@ async def log_event(request: Request):
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="127.0.0.1", port=4000)
```
#### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to
```shell
@ -1081,6 +1074,7 @@ os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event"
#### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"]
Example litellm proxy config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1092,8 +1086,8 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging LLM IO to Galileo
[BETA]
Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/)
@ -1116,6 +1110,7 @@ export GALILEO_PASSWORD=""
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
@ -1151,7 +1146,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
'
```
🎉 That's it - Expect to see your Logs on your Galileo Dashboard
## Logging Proxy Cost + Usage - OpenMeter
@ -1169,6 +1163,7 @@ export OPENMETER_API_KEY=""
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
@ -1204,13 +1199,14 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
'
```
<Image img={require('../../img/openmeter_img_2.png')} />
## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]` this will log all successfull LLM calls to DataDog
**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1230,6 +1226,7 @@ DD_SITE="us5.datadoghq.com" # your datadog base url
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
@ -1257,10 +1254,10 @@ Expected output on Datadog
<Image img={require('../../img/dd_small1.png')} />
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 bucket
@ -1274,6 +1271,7 @@ AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1293,11 +1291,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -1317,6 +1317,7 @@ Your logs should be available on the specified s3 Bucket
## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set
- `litellm.success_callback = ["dynamodb"]`
- `litellm.dynamodb_table_name = "your-table-name"`
@ -1331,6 +1332,7 @@ AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1344,11 +1346,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -1436,19 +1440,18 @@ Your logs should be available on DynamoDB
}
```
## Logging Proxy Input/Output - Sentry
If API calls fail (LLM/database), you can log them to Sentry:
**Step 1** Install Sentry
```shell
pip install --upgrade sentry-sdk
```
**Step 2**: Save your `SENTRY_DSN` and add `litellm_settings`: `failure_callback`
```shell
export SENTRY_DSN="your-sentry-dsn"
```
@ -1468,11 +1471,13 @@ general_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
litellm --test
```
@ -1490,6 +1495,7 @@ ATHINA_API_KEY = "your-athina-api-key"
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1502,11 +1508,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -1538,6 +1546,7 @@ AZURE_CONTENT_SAFETY_KEY = "<your-azure-content-safety-key>"
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1553,11 +1562,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -1573,7 +1584,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
```
An HTTP 400 error will be returned if the content is detected with a value greater than the threshold set in the `config.yaml`.
The details of the response will describe:
- The `source`: input text or LLM-generated text
- The `category`: the category of the content that triggered the moderation
- The `severity`: the severity, from 0 to 10

View file

@ -15,9 +15,9 @@ model_list:
metadata: "here's additional metadata on the model" # returned via GET /model/info
```
## Get Model Information - `/model/info`

Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled from the `model_info` you set and the litellm model cost map. Sensitive details like API keys are excluded for security purposes.
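For example, a minimal sketch of querying this endpoint from Python, assuming the proxy runs locally on port 4000 with master key `sk-1234` and that the response wraps the models in a `data` list:

```python
# Hedged sketch - the base URL, key, and exact response shape are assumptions.
import requests

resp = requests.get(
    "http://0.0.0.0:4000/model/info",
    headers={"Authorization": "Bearer sk-1234"},
)
resp.raise_for_status()
for model in resp.json().get("data", []):
    print(model.get("model_name"), model.get("model_info", {}))
```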
<Tabs
defaultValue="curl"

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [OLD PROXY 👉 [NEW proxy here](./simple_proxy)] Local OpenAI Proxy Server
A fast, and lightweight OpenAI-compatible server to call 100+ LLM APIs.

View file

@ -117,6 +117,7 @@ const sidebars = {
"text_to_speech",
"assistants",
"batches",
"anthropic_completion"
],
},
{
@ -237,6 +238,7 @@ const sidebars = {
label: "Extras",
items: [
"extras/contributing",
"data_security",
"contributing",
"rules",
"proxy_server",

View file

@ -1,6 +1,25 @@
apiVersion: v1
entries:
litellm-helm:
- apiVersion: v2
appVersion: v1.41.8
created: "2024-07-10T00:59:11.1889+08:00"
dependencies:
- condition: db.deployStandalone
name: postgresql
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=13.3.0'
- condition: redis.enabled
name: redis
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=18.0.0'
description: Call all LLM APIs using the OpenAI format
digest: eeff5e4e6cebb4c977cb7359c1ec6c773c66982f6aa39dbed94a674890144a43
name: litellm-helm
type: application
urls:
- https://berriai.github.io/litellm/litellm-helm-0.2.1.tgz
version: 0.2.1
- apiVersion: v2
appVersion: v1.35.38
created: "2024-05-06T10:22:24.384392-07:00"
@ -33,7 +52,7 @@ entries:
licenses: Apache-2.0
apiVersion: v2
appVersion: 16.2.0
created: "2024-05-06T10:22:24.387717-07:00"
created: "2024-07-10T00:59:11.191731+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
@ -60,7 +79,7 @@ entries:
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/postgresql
urls:
- https://berriai.github.io/litellm/charts/postgresql-14.3.1.tgz
version: 14.3.1
redis:
- annotations:
@ -79,7 +98,7 @@ entries:
licenses: Apache-2.0
apiVersion: v2
appVersion: 7.2.4
created: "2024-05-06T10:22:24.391903-07:00"
created: "2024-07-10T00:59:11.195667+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
@ -103,6 +122,6 @@ entries:
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/redis
urls:
- https://berriai.github.io/litellm/charts/redis-18.19.1.tgz
version: 18.19.1
generated: "2024-05-06T10:22:24.375026-07:00"
generated: "2024-07-10T00:59:11.179952+08:00"

BIN
litellm-helm-0.2.1.tgz Normal file

Binary file not shown.

View file

@ -364,7 +364,7 @@ for key, value in model_cost.items():
elif value.get("litellm_provider") == "mistral":
mistral_chat_models.append(key)
elif value.get("litellm_provider") == "anthropic":
anthropic_models.append(key)
elif value.get("litellm_provider") == "empower":
empower_models.append(key)
elif value.get("litellm_provider") == "openrouter":
@ -789,6 +789,7 @@ from .utils import (
get_api_base,
get_first_chars_messages,
ModelResponse,
EmbeddingResponse,
ImageResponse,
get_provider_fields,
)
@ -879,5 +880,11 @@ from .proxy.proxy_cli import run_server
from .router import Router
from .assistants.main import *
from .batches.main import *
from .files.main import *
from .scheduler import *
from .cost_calculator import response_cost_calculator, cost_per_token
### ADAPTERS ###
from .types.adapter import AdapterItem
adapters: List[AdapterItem] = []

View file

@ -0,0 +1,50 @@
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import json
import os
import traceback
import uuid
from typing import Literal, Optional
import dotenv
import httpx
from pydantic import BaseModel
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
class AnthropicAdapter(CustomLogger):
def __init__(self) -> None:
super().__init__()
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
- translate params, where needed
- pass rest, as is
"""
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
translated_body = litellm.AnthropicConfig().translate_anthropic_to_openai(
anthropic_message_request=request_body
)
return translated_body
def translate_completion_output_params(
self, response: litellm.ModelResponse
) -> Optional[AnthropicResponse]:
return litellm.AnthropicConfig().translate_openai_response_to_anthropic(
response=response
)
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
return super().translate_completion_output_params_streaming()
anthropic_adapter = AnthropicAdapter()
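A hedged usage sketch for the adapter above. It assumes this module lives at `litellm.adapters.anthropic_adapter` and that `AdapterItem` (see the `adapters: List[AdapterItem] = []` hook added to `litellm/__init__.py` in this diff) carries `id` and `adapter` fields; neither is confirmed by this excerpt.

```python
# Hypothetical wiring - the module path and AdapterItem field names are assumptions.
import litellm
from litellm.adapters.anthropic_adapter import anthropic_adapter

# register the adapter so Anthropic /v1/messages-style requests can be translated
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]

# translate an Anthropic-format request body into litellm.completion() kwargs
translated = anthropic_adapter.translate_completion_input_params(
    {
        "model": "gpt-3.5-turbo",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hello, world"}],
    }
)
print(translated)
```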

View file

@ -10,296 +10,37 @@ https://platform.openai.com/docs/api-reference/batch
"""
import asyncio
import contextvars
import os
from functools import partial
from typing import Any, Coroutine, Dict, Literal, Optional, Union
import httpx
import litellm
from litellm import client
from litellm.utils import supports_httpx_timeout
from ..llms.openai import OpenAIBatchesAPI, OpenAIFilesAPI
from ..types.llms.openai import (
Batch,
CancelBatchRequest,
CreateBatchRequest,
CreateFileRequest,
FileContentRequest,
FileObject,
FileTypes,
HttpxBinaryResponseContent,
RetrieveBatchRequest,
)
from ..types.router import *
####### ENVIRONMENT VARIABLES ###################
openai_batches_instance = OpenAIBatchesAPI()
openai_files_instance = OpenAIFilesAPI()
#################################################
async def acreate_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, FileObject]:
"""
Async: Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["acreate_file"] = True
# Use a partial function to pass your keyword arguments
func = partial(
create_file,
file,
purpose,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def create_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
"""
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_create_file_request = CreateFileRequest(
file=file,
purpose=purpose,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("acreate_file", False) is True
response = openai_files_instance.create_file(
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
create_file_data=_create_file_request,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def afile_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, HttpxBinaryResponseContent]:
"""
Async: Get file contents
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["afile_content"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_content,
file_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]]:
"""
Returns the contents of the specified file.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_file_content_request = FileContentRequest(
file_id=file_id,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("afile_content", False) is True
response = openai_files_instance.file_content(
_is_async=_is_async,
file_content_request=_file_content_request,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def acreate_batch(
completion_window: Literal["24h"],
endpoint: Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"],

View file

@ -15,10 +15,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import (
from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_per_token as google_cost_per_token,
)
from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_router as google_cost_router,
)
from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
from litellm.utils import (
CallTypes,
CostPerToken,
@ -160,22 +162,32 @@ def cost_per_token(
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
print_verbose(f"Looking up model={model} in model_cost_map")
if custom_llm_provider == "vertex_ai" and "claude" in model:
return google_cost_per_token(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
if custom_llm_provider == "vertex_ai":
cost_router = google_cost_router(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
call_type=call_type,
)
if cost_router == "cost_per_character":
return google_cost_per_character(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif cost_router == "cost_per_token":
return google_cost_per_token(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif custom_llm_provider == "gemini":
return google_cost_per_token(
model=model_without_prefix,

659
litellm/files/main.py Normal file
View file

@ -0,0 +1,659 @@
"""
Main File for Files API implementation
https://platform.openai.com/docs/api-reference/files
"""
import asyncio
import contextvars
import os
from functools import partial
from typing import Any, Coroutine, Dict, Literal, Optional, Union
import httpx
import litellm
from litellm import client
from litellm.llms.openai import FileDeleted, FileObject, OpenAIFilesAPI
from litellm.types.llms.openai import (
Batch,
CreateFileRequest,
FileContentRequest,
FileTypes,
HttpxBinaryResponseContent,
)
from litellm.types.router import *
from litellm.utils import supports_httpx_timeout
####### ENVIRONMENT VARIABLES ###################
openai_files_instance = OpenAIFilesAPI()
#################################################
async def afile_retrieve(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, FileObject]:
"""
Async: Get file contents
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["is_async"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_retrieve,
file_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_retrieve(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FileObject:
"""
Returns the contents of the specified file.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
response = openai_files_instance.retrieve_file(
file_id=file_id,
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
# Delete file
async def afile_delete(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, FileObject]:
"""
Async: Delete file
LiteLLM Equivalent of DELETE https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["is_async"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_delete,
file_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_delete(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FileDeleted:
"""
Delete file
LiteLLM Equivalent of DELETE https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
response = openai_files_instance.delete_file(
file_id=file_id,
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
# List files
async def afile_list(
custom_llm_provider: Literal["openai"] = "openai",
purpose: Optional[str] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
):
"""
Async: List files
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["is_async"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_list,
custom_llm_provider,
purpose,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_list(
custom_llm_provider: Literal["openai"] = "openai",
purpose: Optional[str] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
):
"""
List files
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
response = openai_files_instance.list_files(
purpose=purpose,
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'file_list'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="file_list", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def acreate_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, FileObject]:
"""
Async: Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["acreate_file"] = True
# Use a partial function to pass your keyword arguments
func = partial(
create_file,
file,
purpose,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def create_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
"""
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_create_file_request = CreateFileRequest(
file=file,
purpose=purpose,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("acreate_file", False) is True
response = openai_files_instance.create_file(
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
create_file_data=_create_file_request,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def afile_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, HttpxBinaryResponseContent]:
"""
Async: Get file contents
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["afile_content"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_content,
file_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]]:
"""
Returns the contents of the specified file.
LiteLLM Equivalent of GET https://api.openai.com/v1/files/{file_id}/content
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_file_content_request = FileContentRequest(
file_id=file_id,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("afile_content", False) is True
response = openai_files_instance.file_content(
_is_async=_is_async,
file_content_request=_file_content_request,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
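And the synchronous counterpart, again with a placeholder file id and assuming a top-level litellm.file_content export.

import litellm

content = litellm.file_content(
    file_id="file-abc123",
    custom_llm_provider="openai",
)
with open("downloaded.jsonl", "wb") as f:
    f.write(content.content)   # raw bytes of the stored file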

View file

@ -5,9 +5,12 @@ import traceback
from typing import Literal, Optional, Union
import dotenv
from pydantic import BaseModel
from litellm.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.llms.openai import ChatCompletionRequest
from litellm.types.utils import ModelResponse
class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callback#callback-class
@ -55,6 +58,30 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
def pre_call_check(self, deployment: dict) -> Optional[dict]:
pass
#### ADAPTERS #### Allow calling 100+ LLMs in custom format - https://github.com/BerriAI/litellm/pulls
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
Translates the input params from the provider's native format to the litellm.completion() format.
"""
pass
def translate_completion_output_params(
self, response: ModelResponse
) -> Optional[BaseModel]:
"""
Translates the output params from the OpenAI format to the custom format.
"""
pass
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
"""
Translates the streaming chunk from the OpenAI format to the custom format.
"""
pass
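A minimal sketch of what an adapter built on these hooks could look like; the class name, response schema, and field mapping below are invented for illustration and assume the code sits alongside the CustomLogger class shown above.

from typing import Optional
from pydantic import BaseModel   # already imported at the top of this file

class MyCustomResponse(BaseModel):
    text: Optional[str] = None

class MyFormatAdapter(CustomLogger):
    def translate_completion_input_params(self, kwargs) -> Optional[dict]:
        # map a provider-native payload into litellm.completion() kwargs
        return {"model": kwargs.get("model"), "messages": kwargs.get("messages", [])}

    def translate_completion_output_params(self, response) -> Optional[BaseModel]:
        # lift the first choice's text out of the OpenAI-format ModelResponse
        return MyCustomResponse(text=response.choices[0].message.content)

    def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
        # streaming translation is omitted from this sketch
        return None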
#### CALL HOOKS - proxy only ####
"""
Control / modify the incoming / outgoing data before calling the model

View file

@ -326,7 +326,12 @@ class LangFuseLogger:
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
try:
new_metadata[key] = copy.deepcopy(value)
except Exception as e:
verbose_logger.error(
f"Langfuse [Non-blocking error] - error copying metadata: {str(e)}"
)
metadata = new_metadata
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")

View file

@ -52,6 +52,12 @@ class OpenTelemetryConfig:
OTEL_HEADERS gets sent as headers = {"x-honeycomb-team": "B85YgLm96******"}
"""
from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
InMemorySpanExporter,
)
if os.getenv("OTEL_EXPORTER") == "in_memory":
return cls(exporter=InMemorySpanExporter())
return cls(
exporter=os.getenv("OTEL_EXPORTER", "console"),
endpoint=os.getenv("OTEL_ENDPOINT"),

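For context, a generic OpenTelemetry snippet showing what the "in_memory" exporter selected above enables in tests; the tracer wiring here is standard SDK usage, not this project's own setup code.

from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

exporter = InMemorySpanExporter()
provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(exporter))

tracer = provider.get_tracer("otel-smoke-test")
with tracer.start_as_current_span("proxy_request"):
    pass   # spans emitted here stay in memory instead of going to a backend

spans = exporter.get_finished_spans()
assert spans[0].name == "proxy_request"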
View file

@ -675,7 +675,7 @@ class SlackAlerting(CustomLogger):
async def failed_tracking_alert(self, error_message: str):
"""Raise alert when tracking failed for specific model"""
_cache: DualCache = self.internal_usage_cache
message = "Failed Tracking Cost for" + error_message
message = "Failed Tracking Cost for " + error_message
_cache_key = "budget_alerts:failed_tracking:{}".format(message)
result = await _cache.async_get_cache(key=_cache_key)
if result is None:
@ -1530,15 +1530,19 @@ Model Info:
"""Log deployment latency"""
try:
if "daily_reports" in self.alert_types:
model_id = (
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
)
litellm_params = kwargs.get("litellm_params", {}) or {}
model_info = litellm_params.get("model_info", {}) or {}
model_id = model_info.get("id", "") or ""
response_s: timedelta = end_time - start_time
final_value = response_s
total_tokens = 0
if isinstance(response_obj, litellm.ModelResponse):
if isinstance(response_obj, litellm.ModelResponse) and (
hasattr(response_obj, "usage")
and response_obj.usage is not None
and hasattr(response_obj.usage, "completion_tokens")
):
completion_tokens = response_obj.usage.completion_tokens
if completion_tokens is not None and completion_tokens > 0:
final_value = float(
@ -1557,8 +1561,7 @@ Model Info:
)
except Exception as e:
verbose_proxy_logger.error(
"[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: ",
e,
f"[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: {str(e)}"
)
pass

View file

@ -1275,7 +1275,7 @@ class Logging:
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
)
except litellm.NotFoundError as e:
verbose_logger.error(
verbose_logger.warning(
f"Model={self.model} not found in completion cost map. Setting 'response_cost' to None"
)
self.model_call_details["response_cost"] = None

View file

@ -1,7 +1,7 @@
# What is this?
## Cost calculation for Google AI Studio / Vertex AI models
import traceback
from typing import List, Literal, Optional, Tuple
from typing import List, Literal, Optional, Tuple, Union
import litellm
from litellm import verbose_logger
@ -29,6 +29,32 @@ def _is_above_128k(tokens: float) -> bool:
return False
def cost_router(
model: str,
custom_llm_provider: str,
prompt_tokens: float,
completion_tokens: float,
prompt_characters: float,
completion_characters: float,
call_type: Union[Literal["embedding", "aembedding"], str],
) -> Literal["cost_per_character", "cost_per_token"]:
"""
Route the cost calc to the right place, based on model/call_type/etc.
Returns
- str, the specific google cost calc function it should route to.
"""
if custom_llm_provider == "vertex_ai" and "claude" in model:
return "cost_per_token"
elif custom_llm_provider == "gemini":
return "cost_per_token"
elif custom_llm_provider == "vertex_ai" and (
call_type == "embedding" or call_type == "aembedding"
):
return "cost_per_token"
return "cost_per_character"
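Two illustrative calls against the router above (token and character counts are placeholders); a plain Gemini call routes per-token, while a non-Claude, non-embedding Vertex AI call falls through to per-character pricing.

# placeholder counts; only model / provider / call_type drive the routing decision
assert cost_router(
    model="gemini-1.5-pro",
    custom_llm_provider="gemini",
    prompt_tokens=100,
    completion_tokens=50,
    prompt_characters=400,
    completion_characters=200,
    call_type="completion",
) == "cost_per_token"

assert cost_router(
    model="gemini-1.5-pro",
    custom_llm_provider="vertex_ai",
    prompt_tokens=100,
    completion_tokens=50,
    prompt_characters=400,
    completion_characters=200,
    call_type="completion",
) == "cost_per_character"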
def cost_per_character(
model: str,
custom_llm_provider: str,

View file

@ -1,11 +1,16 @@
import os, types, traceback
import json
import os
import time # type: ignore
import traceback
import types
from enum import Enum
import requests # type: ignore
import time, httpx # type: ignore
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message
import httpx
import requests # type: ignore
import litellm
from litellm.utils import Choices, Message, ModelResponse
class AI21Error(Exception):
@ -185,7 +190,7 @@ def completion(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
raise AI21Error(
message=traceback.format_exc(), status_code=response.status_code
@ -197,13 +202,17 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
}
model_response.created = int(time.time())
model_response.model = model
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
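The same attribute-style refactor recurs throughout the provider handlers in this commit; in isolation the pattern is roughly the following (model name and token counts are placeholders).

import time
import litellm
from litellm.utils import ModelResponse

# sketch of the pattern only: attribute access plus a typed Usage object
# instead of dict-style assignment on the ModelResponse
model_response = ModelResponse()
model_response.created = int(time.time())
model_response.model = "j2-ultra"
setattr(
    model_response,
    "usage",
    litellm.Usage(prompt_tokens=10, completion_tokens=5, total_tokens=15),
)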

View file

@ -1,12 +1,15 @@
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse, Choices, Message, Usage
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import Choices, Message, ModelResponse, Usage
class AlephAlphaError(Exception):
@ -275,7 +278,7 @@ def completion(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except:
raise AlephAlphaError(
message=json.dumps(completion_response),
@ -291,8 +294,8 @@ def completion(
)
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -20,19 +20,43 @@ from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client,
)
from litellm.types.llms.anthropic import (
AnthopicMessagesAssistantMessageParam,
AnthropicFinishReason,
AnthropicMessagesRequest,
AnthropicMessagesTool,
AnthropicMessagesToolChoice,
AnthropicMessagesUserMessageParam,
AnthropicResponse,
AnthropicResponseContentBlockText,
AnthropicResponseContentBlockToolUse,
AnthropicResponseUsageBlock,
ContentBlockDelta,
ContentBlockStart,
MessageBlockDelta,
MessageStartBlock,
)
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionAssistantMessage,
ChatCompletionAssistantToolCall,
ChatCompletionImageObject,
ChatCompletionImageUrlObject,
ChatCompletionRequest,
ChatCompletionResponseMessage,
ChatCompletionSystemMessage,
ChatCompletionTextObject,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionToolChoiceFunctionParam,
ChatCompletionToolChoiceObjectParam,
ChatCompletionToolChoiceValues,
ChatCompletionToolMessage,
ChatCompletionToolParam,
ChatCompletionToolParamFunctionChunk,
ChatCompletionUsageBlock,
ChatCompletionUserMessage,
)
from litellm.types.utils import GenericStreamingChunk
from litellm.types.utils import Choices, GenericStreamingChunk
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
from .base import BaseLLM
@ -168,6 +192,287 @@ class AnthropicConfig:
optional_params["top_p"] = value
return optional_params
### FOR [BETA] `/v1/messages` endpoint support
def translatable_anthropic_params(self) -> List:
"""
Which anthropic params we need to translate to the openai format.
"""
return ["messages", "metadata", "system", "tool_choice", "tools"]
def translate_anthropic_messages_to_openai(
self,
messages: List[
Union[
AnthropicMessagesUserMessageParam,
AnthopicMessagesAssistantMessageParam,
]
],
) -> List:
new_messages: List[AllMessageValues] = []
for m in messages:
user_message: Optional[ChatCompletionUserMessage] = None
tool_message_list: List[ChatCompletionToolMessage] = []
## USER MESSAGE ##
if m["role"] == "user":
## translate user message
if isinstance(m["content"], str):
user_message = ChatCompletionUserMessage(
role="user", content=m["content"]
)
elif isinstance(m["content"], list):
new_user_content_list: List[
Union[ChatCompletionTextObject, ChatCompletionImageObject]
] = []
for content in m["content"]:
if content["type"] == "text":
text_obj = ChatCompletionTextObject(
type="text", text=content["text"]
)
new_user_content_list.append(text_obj)
elif content["type"] == "image":
image_url = ChatCompletionImageUrlObject(
url=f"data:{content['type']};base64,{content['source']}"
)
image_obj = ChatCompletionImageObject(
type="image_url", image_url=image_url
)
new_user_content_list.append(image_obj)
elif content["type"] == "tool_result":
if "content" not in content:
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content="",
)
tool_message_list.append(tool_result)
elif isinstance(content["content"], str):
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=content["content"],
)
tool_message_list.append(tool_result)
elif isinstance(content["content"], list):
for c in content["content"]:
if c["type"] == "text":
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=c["text"],
)
tool_message_list.append(tool_result)
elif c["type"] == "image":
image_str = (
f"data:{c['type']};base64,{c['source']}"
)
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=image_str,
)
tool_message_list.append(tool_result)
if user_message is not None:
new_messages.append(user_message)
if len(tool_message_list) > 0:
new_messages.extend(tool_message_list)
## ASSISTANT MESSAGE ##
assistant_message_str: Optional[str] = None
tool_calls: List[ChatCompletionAssistantToolCall] = []
if m["role"] == "assistant":
if isinstance(m["content"], str):
assistant_message_str = m["content"]
elif isinstance(m["content"], list):
for content in m["content"]:
if content["type"] == "text":
if assistant_message_str is None:
assistant_message_str = content["text"]
else:
assistant_message_str += content["text"]
elif content["type"] == "tool_use":
function_chunk = ChatCompletionToolCallFunctionChunk(
name=content["name"],
arguments=json.dumps(content["input"]),
)
tool_calls.append(
ChatCompletionAssistantToolCall(
id=content["id"],
type="function",
function=function_chunk,
)
)
if assistant_message_str is not None or len(tool_calls) > 0:
assistant_message = ChatCompletionAssistantMessage(
role="assistant",
content=assistant_message_str,
)
if len(tool_calls) > 0:
assistant_message["tool_calls"] = tool_calls
new_messages.append(assistant_message)
return new_messages
def translate_anthropic_tool_choice_to_openai(
self, tool_choice: AnthropicMessagesToolChoice
) -> ChatCompletionToolChoiceValues:
if tool_choice["type"] == "any":
return "required"
elif tool_choice["type"] == "auto":
return "auto"
elif tool_choice["type"] == "tool":
tc_function_param = ChatCompletionToolChoiceFunctionParam(
name=tool_choice.get("name", "")
)
return ChatCompletionToolChoiceObjectParam(
type="function", function=tc_function_param
)
else:
raise ValueError(
"Incompatible tool choice param submitted - {}".format(tool_choice)
)
def translate_anthropic_tools_to_openai(
self, tools: List[AnthropicMessagesTool]
) -> List[ChatCompletionToolParam]:
new_tools: List[ChatCompletionToolParam] = []
for tool in tools:
function_chunk = ChatCompletionToolParamFunctionChunk(
name=tool["name"],
parameters=tool["input_schema"],
)
if "description" in tool:
function_chunk["description"] = tool["description"]
new_tools.append(
ChatCompletionToolParam(type="function", function=function_chunk)
)
return new_tools
def translate_anthropic_to_openai(
self, anthropic_message_request: AnthropicMessagesRequest
) -> ChatCompletionRequest:
"""
This is used by the beta Anthropic Adapter, for translating anthropic `/v1/messages` requests to the openai format.
"""
new_messages: List[AllMessageValues] = []
## CONVERT ANTHROPIC MESSAGES TO OPENAI
new_messages = self.translate_anthropic_messages_to_openai(
messages=anthropic_message_request["messages"]
)
## ADD SYSTEM MESSAGE TO MESSAGES
if "system" in anthropic_message_request:
new_messages.insert(
0,
ChatCompletionSystemMessage(
role="system", content=anthropic_message_request["system"]
),
)
new_kwargs: ChatCompletionRequest = {
"model": anthropic_message_request["model"],
"messages": new_messages,
}
## CONVERT METADATA (user_id)
if "metadata" in anthropic_message_request:
if "user_id" in anthropic_message_request["metadata"]:
new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"]
## CONVERT TOOL CHOICE
if "tool_choice" in anthropic_message_request:
new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai(
tool_choice=anthropic_message_request["tool_choice"]
)
## CONVERT TOOLS
if "tools" in anthropic_message_request:
new_kwargs["tools"] = self.translate_anthropic_tools_to_openai(
tools=anthropic_message_request["tools"]
)
translatable_params = self.translatable_anthropic_params()
for k, v in anthropic_message_request.items():
if k not in translatable_params: # pass remaining params as is
new_kwargs[k] = v # type: ignore
return new_kwargs
def _translate_openai_content_to_anthropic(
self, choices: List[Choices]
) -> List[
Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
]:
new_content: List[
Union[
AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse
]
] = []
for choice in choices:
if (
choice.message.tool_calls is not None
and len(choice.message.tool_calls) > 0
):
for tool_call in choice.message.tool_calls:
new_content.append(
AnthropicResponseContentBlockToolUse(
type="tool_use",
id=tool_call.id,
name=tool_call.function.name or "",
input=json.loads(tool_call.function.arguments),
)
)
elif choice.message.content is not None:
new_content.append(
AnthropicResponseContentBlockText(
type="text", text=choice.message.content
)
)
return new_content
def _translate_openai_finish_reason_to_anthropic(
self, openai_finish_reason: str
) -> AnthropicFinishReason:
if openai_finish_reason == "stop":
return "end_turn"
elif openai_finish_reason == "length":
return "max_tokens"
elif openai_finish_reason == "tool_calls":
return "tool_use"
return "end_turn"
def translate_openai_response_to_anthropic(
self, response: litellm.ModelResponse
) -> AnthropicResponse:
## translate content block
anthropic_content = self._translate_openai_content_to_anthropic(choices=response.choices) # type: ignore
## extract finish reason
anthropic_finish_reason = self._translate_openai_finish_reason_to_anthropic(
openai_finish_reason=response.choices[0].finish_reason # type: ignore
)
# extract usage
usage: litellm.Usage = getattr(response, "usage")
anthropic_usage = AnthropicResponseUsageBlock(
input_tokens=usage.prompt_tokens, output_tokens=usage.completion_tokens
)
translated_obj = AnthropicResponse(
id=response.id,
type="message",
role="assistant",
model=response.model or "unknown-model",
stop_sequence=None,
usage=anthropic_usage,
content=anthropic_content,
stop_reason=anthropic_finish_reason,
)
return translated_obj
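A hedged sketch of exercising the request-side translation above; the /v1/messages payload is made up for illustration, and the assertions only restate what the code shown here does.

config = AnthropicConfig()

anthropic_request = {
    "model": "claude-3-opus-20240229",   # illustrative model name
    "max_tokens": 256,
    "system": "You are a terse assistant.",
    "messages": [{"role": "user", "content": "Say hi"}],
}

openai_request = config.translate_anthropic_to_openai(
    anthropic_message_request=anthropic_request  # type: ignore
)
# the system prompt becomes the first OpenAI message; untranslated params pass through
assert openai_request["messages"][0]["role"] == "system"
assert openai_request["max_tokens"] == 256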
# makes headers for API call
def validate_environment(api_key, user_headers):
@ -231,121 +536,6 @@ class AnthropicChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
# def process_streaming_response(
# self,
# model: str,
# response: Union[requests.Response, httpx.Response],
# model_response: ModelResponse,
# stream: bool,
# logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
# optional_params: dict,
# api_key: str,
# data: Union[dict, str],
# messages: List,
# print_verbose,
# encoding,
# ) -> CustomStreamWrapper:
# """
# Return stream object for tool-calling + streaming
# """
# ## LOGGING
# logging_obj.post_call(
# input=messages,
# api_key=api_key,
# original_response=response.text,
# additional_args={"complete_input_dict": data},
# )
# print_verbose(f"raw model_response: {response.text}")
# ## RESPONSE OBJECT
# try:
# completion_response = response.json()
# except:
# raise AnthropicError(
# message=response.text, status_code=response.status_code
# )
# text_content = ""
# tool_calls = []
# for content in completion_response["content"]:
# if content["type"] == "text":
# text_content += content["text"]
# ## TOOL CALLING
# elif content["type"] == "tool_use":
# tool_calls.append(
# {
# "id": content["id"],
# "type": "function",
# "function": {
# "name": content["name"],
# "arguments": json.dumps(content["input"]),
# },
# }
# )
# if "error" in completion_response:
# raise AnthropicError(
# message=str(completion_response["error"]),
# status_code=response.status_code,
# )
# _message = litellm.Message(
# tool_calls=tool_calls,
# content=text_content or None,
# )
# model_response.choices[0].message = _message # type: ignore
# model_response._hidden_params["original_response"] = completion_response[
# "content"
# ] # allow user to access raw anthropic tool calling response
# model_response.choices[0].finish_reason = map_finish_reason(
# completion_response["stop_reason"]
# )
# print_verbose("INSIDE ANTHROPIC STREAMING TOOL CALLING CONDITION BLOCK")
# # return an iterator
# streaming_model_response = ModelResponse(stream=True)
# streaming_model_response.choices[0].finish_reason = model_response.choices[ # type: ignore
# 0
# ].finish_reason
# # streaming_model_response.choices = [litellm.utils.StreamingChoices()]
# streaming_choice = litellm.utils.StreamingChoices()
# streaming_choice.index = model_response.choices[0].index
# _tool_calls = []
# print_verbose(
# f"type of model_response.choices[0]: {type(model_response.choices[0])}"
# )
# print_verbose(f"type of streaming_choice: {type(streaming_choice)}")
# if isinstance(model_response.choices[0], litellm.Choices):
# if getattr(
# model_response.choices[0].message, "tool_calls", None
# ) is not None and isinstance(
# model_response.choices[0].message.tool_calls, list
# ):
# for tool_call in model_response.choices[0].message.tool_calls:
# _tool_call = {**tool_call.dict(), "index": 0}
# _tool_calls.append(_tool_call)
# delta_obj = litellm.utils.Delta(
# content=getattr(model_response.choices[0].message, "content", None),
# role=model_response.choices[0].message.role,
# tool_calls=_tool_calls,
# )
# streaming_choice.delta = delta_obj
# streaming_model_response.choices = [streaming_choice]
# completion_stream = ModelResponseIterator(
# model_response=streaming_model_response
# )
# print_verbose(
# "Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
# )
# return CustomStreamWrapper(
# completion_stream=completion_stream,
# model=model,
# custom_llm_provider="cached_response",
# logging_obj=logging_obj,
# )
# else:
# raise AnthropicError(
# status_code=422,
# message="Unprocessable response object - {}".format(response.text),
# )
def process_response(
self,
model: str,
@ -417,8 +607,8 @@ class AnthropicChatCompletion(BaseLLM):
completion_tokens = completion_response["usage"]["output_tokens"]
total_tokens = prompt_tokens + completion_tokens
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,15 +1,19 @@
import os, types
import json
from enum import Enum
import requests
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
import httpx
from .base import BaseLLM
import requests
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class AnthropicConstants(Enum):
@ -117,9 +121,9 @@ class AnthropicTextCompletion(BaseLLM):
)
else:
if len(completion_response["completion"]) > 0:
model_response["choices"][0]["message"]["content"] = (
completion_response["completion"]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"completion"
]
model_response.choices[0].finish_reason = completion_response["stop_reason"]
## CALCULATING USAGE
@ -130,8 +134,8 @@ class AnthropicTextCompletion(BaseLLM):
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
) ##[TODO] use the anthropic tokenizer here
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,9 +1,11 @@
import os
import json
from enum import Enum
import requests # type: ignore
import os
import time
from enum import Enum
from typing import Callable
import requests # type: ignore
from litellm.utils import ModelResponse, Usage
@ -106,28 +108,32 @@ def completion(
and "data" in completion_response["model_output"]
and isinstance(completion_response["model_output"]["data"], list)
):
model_response["choices"][0]["message"]["content"] = (
completion_response["model_output"]["data"][0]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"model_output"
][
"data"
][
0
]
elif isinstance(completion_response["model_output"], str):
model_response["choices"][0]["message"]["content"] = (
completion_response["model_output"]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"model_output"
]
elif "completion" in completion_response and isinstance(
completion_response["completion"], str
):
model_response["choices"][0]["message"]["content"] = (
completion_response["completion"]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"completion"
]
elif isinstance(completion_response, list) and len(completion_response) > 0:
if "generated_text" not in completion_response:
raise BasetenError(
message=f"Unable to parse response. Original response: {response.text}",
status_code=response.status_code,
)
model_response["choices"][0]["message"]["content"] = (
completion_response[0]["generated_text"]
)
model_response.choices[0].message.content = completion_response[0][ # type: ignore
"generated_text"
]
## GETTING LOGPROBS
if (
"details" in completion_response[0]
@ -139,7 +145,7 @@ def completion(
sum_logprob = 0
for token in completion_response[0]["details"]["tokens"]:
sum_logprob += token["logprob"]
model_response["choices"][0]["message"]._logprobs = sum_logprob
model_response.choices[0].logprobs = sum_logprob
else:
raise BasetenError(
message=f"Unable to parse response. Original response: {response.text}",
@ -152,8 +158,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"]["content"])
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1122,7 +1122,7 @@ def completion(
logging_obj=logging_obj,
)
model_response["finish_reason"] = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
response_body["stop_reason"]
)
_usage = litellm.Usage(
@ -1134,14 +1134,16 @@ def completion(
setattr(model_response, "usage", _usage)
else:
outputText = response_body["completion"]
model_response["finish_reason"] = response_body["stop_reason"]
model_response.choices[0].finish_reason = response_body["stop_reason"]
elif provider == "cohere":
outputText = response_body["generations"][0]["text"]
elif provider == "meta":
outputText = response_body["generation"]
elif provider == "mistral":
outputText = response_body["outputs"][0]["text"]
model_response["finish_reason"] = response_body["outputs"][0]["stop_reason"]
model_response.choices[0].finish_reason = response_body["outputs"][0][
"stop_reason"
]
else: # amazon titan
outputText = response_body.get("results")[0].get("outputText")
@ -1160,7 +1162,7 @@ def completion(
and getattr(model_response.choices[0].message, "tool_calls", None)
is None
):
model_response["choices"][0]["message"]["content"] = outputText
model_response.choices[0].message.content = outputText
elif (
hasattr(model_response.choices[0], "message")
and getattr(model_response.choices[0].message, "tool_calls", None)
@ -1199,8 +1201,8 @@ def completion(
)
setattr(model_response, "usage", usage)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
model_response._hidden_params["region_name"] = client.meta.region_name
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
@ -1323,9 +1325,9 @@ def _embedding_func_single(
def embedding(
model: str,
input: Union[list, str],
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
):
@ -1391,9 +1393,9 @@ def embedding(
"embedding": embedding,
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
model_response.object = "list"
model_response.data = embedding_response
model_response.model = model
input_tokens = 0
input_str = "".join(input)

View file

@ -521,7 +521,7 @@ class BedrockLLM(BaseLLM):
outputText = completion_response["text"] # type: ignore
elif "generations" in completion_response:
outputText = completion_response["generations"][0]["text"]
model_response["finish_reason"] = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
completion_response["generations"][0]["finish_reason"]
)
elif provider == "anthropic":
@ -625,7 +625,7 @@ class BedrockLLM(BaseLLM):
logging_obj=logging_obj,
)
model_response["finish_reason"] = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
completion_response.get("stop_reason", "")
)
_usage = litellm.Usage(
@ -638,7 +638,9 @@ class BedrockLLM(BaseLLM):
else:
outputText = completion_response["completion"]
model_response["finish_reason"] = completion_response["stop_reason"]
model_response.choices[0].finish_reason = completion_response[
"stop_reason"
]
elif provider == "ai21":
outputText = (
completion_response.get("completions")[0].get("data").get("text")
@ -647,9 +649,9 @@ class BedrockLLM(BaseLLM):
outputText = completion_response["generation"]
elif provider == "mistral":
outputText = completion_response["outputs"][0]["text"]
model_response["finish_reason"] = completion_response["outputs"][0][
"stop_reason"
]
model_response.choices[0].finish_reason = completion_response[
"outputs"
][0]["stop_reason"]
else: # amazon titan
outputText = completion_response.get("results")[0].get("outputText")
except Exception as e:
@ -667,7 +669,7 @@ class BedrockLLM(BaseLLM):
and getattr(model_response.choices[0].message, "tool_calls", None)
is None
):
model_response["choices"][0]["message"]["content"] = outputText
model_response.choices[0].message.content = outputText
elif (
hasattr(model_response.choices[0], "message")
and getattr(model_response.choices[0].message, "tool_calls", None)
@ -723,8 +725,8 @@ class BedrockLLM(BaseLLM):
)
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -1066,7 +1068,7 @@ class BedrockLLM(BaseLLM):
if response.status_code != 200:
raise BedrockError(
status_code=response.status_code, message=response.text
status_code=response.status_code, message=response.read()
)
decoder = AWSEventStreamDecoder(model=model)
@ -1446,8 +1448,8 @@ class BedrockConverseLLM(BaseLLM):
message=litellm.Message(**chat_completion_message),
)
]
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=input_tokens,
completion_tokens=output_tokens,

View file

@ -1,13 +1,18 @@
import os, types, traceback
import json
import requests
import os
import time
import traceback
import types
from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage, Choices, Message, CustomStreamWrapper
import litellm
import httpx
import requests
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage
from .prompt_templates.factory import custom_prompt, prompt_factory
class ClarifaiError(Exception):
@ -87,7 +92,14 @@ def completions_to_model(payload):
def process_response(
model, prompt, response, model_response, api_key, data, encoding, logging_obj
model,
prompt,
response,
model_response: litellm.ModelResponse,
api_key,
data,
encoding,
logging_obj,
):
logging_obj.post_call(
input=prompt,
@ -116,7 +128,7 @@ def process_response(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
raise ClarifaiError(
@ -128,11 +140,15 @@ def process_response(
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["model"] = model
model_response["usage"] = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
model_response.model = model
setattr(
model_response,
"usage",
Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
@ -202,7 +218,7 @@ async def async_completion(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
raise ClarifaiError(
@ -214,11 +230,15 @@ async def async_completion(
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["model"] = model
model_response["usage"] = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
model_response.model = model
setattr(
model_response,
"usage",
Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response

View file

@ -1,13 +1,17 @@
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import litellm
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import custom_prompt, prompt_factory
class CloudflareError(Exception):
@ -147,9 +151,9 @@ def completion(
)
completion_response = response.json()
model_response["choices"][0]["message"]["content"] = completion_response[
"result"
]["response"]
model_response.choices[0].message.content = completion_response["result"][ # type: ignore
"response"
]
## CALCULATING USAGE
print_verbose(
@ -160,8 +164,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = "cloudflare/" + model
model_response.created = int(time.time())
model_response.model = "cloudflare/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,12 +1,16 @@
import os, types
import json
import os
import time
import traceback
import types
from enum import Enum
import requests # type: ignore
import time, traceback
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import Choices, Message, ModelResponse, Usage
class CohereError(Exception):
@ -117,7 +121,7 @@ class CohereConfig:
def validate_environment(api_key):
headers = {
"Request-Source":"unspecified:litellm",
"Request-Source": "unspecified:litellm",
"accept": "application/json",
"content-type": "application/json",
}
@ -219,7 +223,7 @@ def completion(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
raise CohereError(
message=response.text, status_code=response.status_code
@ -231,8 +235,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -245,9 +249,9 @@ def completion(
def embedding(
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
encoding=None,
optional_params=None,
):
@ -294,14 +298,18 @@ def embedding(
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
model_response["usage"] = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
)
return model_response

View file

@ -305,8 +305,8 @@ def completion(
prompt_tokens = billed_units.get("input_tokens", 0)
completion_tokens = billed_units.get("output_tokens", 0)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,26 +1,26 @@
# What is this?
## Handler file for databricks API https://docs.databricks.com/en/machine-learning/foundation-models/api-reference.html#chat-request
from functools import partial
import os, types
import copy
import json
from enum import Enum
import requests, copy # type: ignore
import os
import time
from typing import Callable, Optional, List, Union, Tuple, Literal
from litellm.utils import (
ModelResponse,
Usage,
CustomStreamWrapper,
EmbeddingResponse,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from .base import BaseLLM
import types
from enum import Enum
from functools import partial
from typing import Callable, List, Literal, Optional, Tuple, Union
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.databricks import GenericStreamingChunk
from litellm.types.utils import ProviderField
from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class DatabricksError(Exception):
@ -354,8 +354,8 @@ class DatabricksChatCompletion(BaseLLM):
completion_tokens = completion_response["usage"]["output_tokens"]
total_tokens = prompt_tokens + completion_tokens
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,7 +1,7 @@
####################################
######### DEPRECATED FILE ##########
####################################
# logic moved to `vertex_httpx.py` #
# ####################################
# ######### DEPRECATED FILE ##########
# ####################################
# # logic moved to `vertex_httpx.py` #
import copy
import time
@ -92,332 +92,332 @@ class GeminiConfig:
}
class TextStreamer:
"""
A class designed to return an async stream from AsyncGenerateContentResponse object.
"""
# class TextStreamer:
# """
# A class designed to return an async stream from AsyncGenerateContentResponse object.
# """
def __init__(self, response):
self.response = response
self._aiter = self.response.__aiter__()
# def __init__(self, response):
# self.response = response
# self._aiter = self.response.__aiter__()
async def __aiter__(self):
while True:
try:
# This will manually advance the async iterator.
# In the case the next object doesn't exists, __anext__() will simply raise a StopAsyncIteration exception
next_object = await self._aiter.__anext__()
yield next_object
except StopAsyncIteration:
# After getting all items from the async iterator, stop iterating
break
# async def __aiter__(self):
# while True:
# try:
# # This will manually advance the async iterator.
# # In the case the next object doesn't exists, __anext__() will simply raise a StopAsyncIteration exception
# next_object = await self._aiter.__anext__()
# yield next_object
# except StopAsyncIteration:
# # After getting all items from the async iterator, stop iterating
# break
def supports_system_instruction():
import google.generativeai as genai
# def supports_system_instruction():
# import google.generativeai as genai
gemini_pkg_version = Version(genai.__version__)
return gemini_pkg_version >= Version("0.5.0")
# gemini_pkg_version = Version(genai.__version__)
# return gemini_pkg_version >= Version("0.5.0")
def completion(
model: str,
messages: list,
model_response: ModelResponse,
print_verbose: Callable,
api_key,
encoding,
logging_obj,
custom_prompt_dict: dict,
acompletion: bool = False,
optional_params=None,
litellm_params=None,
logger_fn=None,
):
try:
import google.generativeai as genai # type: ignore
except:
raise Exception(
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
)
genai.configure(api_key=api_key)
system_prompt = ""
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
system_prompt, messages = get_system_prompt(messages=messages)
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="gemini"
)
# def completion(
# model: str,
# messages: list,
# model_response: ModelResponse,
# print_verbose: Callable,
# api_key,
# encoding,
# logging_obj,
# custom_prompt_dict: dict,
# acompletion: bool = False,
# optional_params=None,
# litellm_params=None,
# logger_fn=None,
# ):
# try:
# import google.generativeai as genai # type: ignore
# except:
# raise Exception(
# "Importing google.generativeai failed, please run 'pip install -q google-generativeai"
# )
# genai.configure(api_key=api_key)
# system_prompt = ""
# if model in custom_prompt_dict:
# # check if the model has a registered custom prompt
# model_prompt_details = custom_prompt_dict[model]
# prompt = custom_prompt(
# role_dict=model_prompt_details["roles"],
# initial_prompt_value=model_prompt_details["initial_prompt_value"],
# final_prompt_value=model_prompt_details["final_prompt_value"],
# messages=messages,
# )
# else:
# system_prompt, messages = get_system_prompt(messages=messages)
# prompt = prompt_factory(
# model=model, messages=messages, custom_llm_provider="gemini"
# )
## Load Config
inference_params = copy.deepcopy(optional_params)
stream = inference_params.pop("stream", None)
# ## Load Config
# inference_params = copy.deepcopy(optional_params)
# stream = inference_params.pop("stream", None)
# Handle safety settings
safety_settings_param = inference_params.pop("safety_settings", None)
safety_settings = None
if safety_settings_param:
safety_settings = [
genai.types.SafetySettingDict(x) for x in safety_settings_param
]
# # Handle safety settings
# safety_settings_param = inference_params.pop("safety_settings", None)
# safety_settings = None
# if safety_settings_param:
# safety_settings = [
# genai.types.SafetySettingDict(x) for x in safety_settings_param
# ]
config = litellm.GeminiConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > gemini_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
# config = litellm.GeminiConfig.get_config()
# for k, v in config.items():
# if (
# k not in inference_params
# ): # completion(top_k=3) > gemini_config(top_k=3) <- allows for dynamic variables to be passed in
# inference_params[k] = v
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={
"complete_input_dict": {
"inference_params": inference_params,
"system_prompt": system_prompt,
}
},
)
## COMPLETION CALL
try:
_params = {"model_name": "models/{}".format(model)}
_system_instruction = supports_system_instruction()
if _system_instruction and len(system_prompt) > 0:
_params["system_instruction"] = system_prompt
_model = genai.GenerativeModel(**_params)
if stream is True:
if acompletion is True:
# ## LOGGING
# logging_obj.pre_call(
# input=prompt,
# api_key="",
# additional_args={
# "complete_input_dict": {
# "inference_params": inference_params,
# "system_prompt": system_prompt,
# }
# },
# )
# ## COMPLETION CALL
# try:
# _params = {"model_name": "models/{}".format(model)}
# _system_instruction = supports_system_instruction()
# if _system_instruction and len(system_prompt) > 0:
# _params["system_instruction"] = system_prompt
# _model = genai.GenerativeModel(**_params)
# if stream is True:
# if acompletion is True:
async def async_streaming():
try:
response = await _model.generate_content_async(
contents=prompt,
generation_config=genai.types.GenerationConfig(
**inference_params
),
safety_settings=safety_settings,
stream=True,
)
# async def async_streaming():
# try:
# response = await _model.generate_content_async(
# contents=prompt,
# generation_config=genai.types.GenerationConfig(
# **inference_params
# ),
# safety_settings=safety_settings,
# stream=True,
# )
response = litellm.CustomStreamWrapper(
TextStreamer(response),
model,
custom_llm_provider="gemini",
logging_obj=logging_obj,
)
return response
except Exception as e:
raise GeminiError(status_code=500, message=str(e))
# response = litellm.CustomStreamWrapper(
# TextStreamer(response),
# model,
# custom_llm_provider="gemini",
# logging_obj=logging_obj,
# )
# return response
# except Exception as e:
# raise GeminiError(status_code=500, message=str(e))
return async_streaming()
response = _model.generate_content(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
safety_settings=safety_settings,
stream=True,
)
return response
elif acompletion == True:
return async_completion(
_model=_model,
model=model,
prompt=prompt,
inference_params=inference_params,
safety_settings=safety_settings,
logging_obj=logging_obj,
print_verbose=print_verbose,
model_response=model_response,
messages=messages,
encoding=encoding,
)
else:
params = {
"contents": prompt,
"generation_config": genai.types.GenerationConfig(**inference_params),
"safety_settings": safety_settings,
}
response = _model.generate_content(**params)
except Exception as e:
raise GeminiError(
message=str(e),
status_code=500,
)
# return async_streaming()
# response = _model.generate_content(
# contents=prompt,
# generation_config=genai.types.GenerationConfig(**inference_params),
# safety_settings=safety_settings,
# stream=True,
# )
# return response
# elif acompletion == True:
# return async_completion(
# _model=_model,
# model=model,
# prompt=prompt,
# inference_params=inference_params,
# safety_settings=safety_settings,
# logging_obj=logging_obj,
# print_verbose=print_verbose,
# model_response=model_response,
# messages=messages,
# encoding=encoding,
# )
# else:
# params = {
# "contents": prompt,
# "generation_config": genai.types.GenerationConfig(**inference_params),
# "safety_settings": safety_settings,
# }
# response = _model.generate_content(**params)
# except Exception as e:
# raise GeminiError(
# message=str(e),
# status_code=500,
# )
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=response,
additional_args={"complete_input_dict": {}},
)
print_verbose(f"raw model_response: {response}")
## RESPONSE OBJECT
completion_response = response
try:
choices_list = []
for idx, item in enumerate(completion_response.candidates):
if len(item.content.parts) > 0:
message_obj = Message(content=item.content.parts[0].text)
else:
message_obj = Message(content=None)
choice_obj = Choices(index=idx, message=message_obj)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
except Exception as e:
verbose_logger.error("LiteLLM.gemini.py: Exception occurred - {}".format(str(e)))
verbose_logger.debug(traceback.format_exc())
raise GeminiError(
message=traceback.format_exc(), status_code=response.status_code
)
# ## LOGGING
# logging_obj.post_call(
# input=prompt,
# api_key="",
# original_response=response,
# additional_args={"complete_input_dict": {}},
# )
# print_verbose(f"raw model_response: {response}")
# ## RESPONSE OBJECT
# completion_response = response
# try:
# choices_list = []
# for idx, item in enumerate(completion_response.candidates):
# if len(item.content.parts) > 0:
# message_obj = Message(content=item.content.parts[0].text)
# else:
# message_obj = Message(content=None)
# choice_obj = Choices(index=idx, message=message_obj)
# choices_list.append(choice_obj)
# model_response.choices = choices_list
# except Exception as e:
# verbose_logger.error("LiteLLM.gemini.py: Exception occurred - {}".format(str(e)))
# verbose_logger.debug(traceback.format_exc())
# raise GeminiError(
# message=traceback.format_exc(), status_code=response.status_code
# )
try:
completion_response = model_response["choices"][0]["message"].get("content")
if completion_response is None:
raise Exception
except:
original_response = f"response: {response}"
if hasattr(response, "candidates"):
original_response = f"response: {response.candidates}"
if "SAFETY" in original_response:
original_response += (
"\nThe candidate content was flagged for safety reasons."
)
elif "RECITATION" in original_response:
original_response += (
"\nThe candidate content was flagged for recitation reasons."
)
raise GeminiError(
status_code=400,
message=f"No response received. Original response - {original_response}",
)
# try:
# completion_response = model_response["choices"][0]["message"].get("content")
# if completion_response is None:
# raise Exception
# except:
# original_response = f"response: {response}"
# if hasattr(response, "candidates"):
# original_response = f"response: {response.candidates}"
# if "SAFETY" in original_response:
# original_response += (
# "\nThe candidate content was flagged for safety reasons."
# )
# elif "RECITATION" in original_response:
# original_response += (
# "\nThe candidate content was flagged for recitation reasons."
# )
# raise GeminiError(
# status_code=400,
# message=f"No response received. Original response - {original_response}",
# )
## CALCULATING USAGE
prompt_str = ""
for m in messages:
if isinstance(m["content"], str):
prompt_str += m["content"]
elif isinstance(m["content"], list):
for content in m["content"]:
if content["type"] == "text":
prompt_str += content["text"]
prompt_tokens = len(encoding.encode(prompt_str))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
# ## CALCULATING USAGE
# prompt_str = ""
# for m in messages:
# if isinstance(m["content"], str):
# prompt_str += m["content"]
# elif isinstance(m["content"], list):
# for content in m["content"]:
# if content["type"] == "text":
# prompt_str += content["text"]
# prompt_tokens = len(encoding.encode(prompt_str))
# completion_tokens = len(
# encoding.encode(model_response["choices"][0]["message"].get("content", ""))
# )
model_response["created"] = int(time.time())
model_response["model"] = "gemini/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
setattr(model_response, "usage", usage)
return model_response
# model_response.created = int(time.time())
# model_response.model = "gemini/" + model
# usage = Usage(
# prompt_tokens=prompt_tokens,
# completion_tokens=completion_tokens,
# total_tokens=prompt_tokens + completion_tokens,
# )
# setattr(model_response, "usage", usage)
# return model_response
async def async_completion(
_model,
model,
prompt,
inference_params,
safety_settings,
logging_obj,
print_verbose,
model_response,
messages,
encoding,
):
import google.generativeai as genai # type: ignore
# async def async_completion(
# _model,
# model,
# prompt,
# inference_params,
# safety_settings,
# logging_obj,
# print_verbose,
# model_response,
# messages,
# encoding,
# ):
# import google.generativeai as genai # type: ignore
response = await _model.generate_content_async(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
safety_settings=safety_settings,
)
# response = await _model.generate_content_async(
# contents=prompt,
# generation_config=genai.types.GenerationConfig(**inference_params),
# safety_settings=safety_settings,
# )
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=response,
additional_args={"complete_input_dict": {}},
)
print_verbose(f"raw model_response: {response}")
## RESPONSE OBJECT
completion_response = response
try:
choices_list = []
for idx, item in enumerate(completion_response.candidates):
if len(item.content.parts) > 0:
message_obj = Message(content=item.content.parts[0].text)
else:
message_obj = Message(content=None)
choice_obj = Choices(index=idx, message=message_obj)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
except Exception as e:
verbose_logger.error("LiteLLM.gemini.py: Exception occurred - {}".format(str(e)))
verbose_logger.debug(traceback.format_exc())
raise GeminiError(
message=traceback.format_exc(), status_code=response.status_code
)
# ## LOGGING
# logging_obj.post_call(
# input=prompt,
# api_key="",
# original_response=response,
# additional_args={"complete_input_dict": {}},
# )
# print_verbose(f"raw model_response: {response}")
# ## RESPONSE OBJECT
# completion_response = response
# try:
# choices_list = []
# for idx, item in enumerate(completion_response.candidates):
# if len(item.content.parts) > 0:
# message_obj = Message(content=item.content.parts[0].text)
# else:
# message_obj = Message(content=None)
# choice_obj = Choices(index=idx, message=message_obj)
# choices_list.append(choice_obj)
# model_response["choices"] = choices_list
# except Exception as e:
# verbose_logger.error("LiteLLM.gemini.py: Exception occurred - {}".format(str(e)))
# verbose_logger.debug(traceback.format_exc())
# raise GeminiError(
# message=traceback.format_exc(), status_code=response.status_code
# )
try:
completion_response = model_response["choices"][0]["message"].get("content")
if completion_response is None:
raise Exception
except:
original_response = f"response: {response}"
if hasattr(response, "candidates"):
original_response = f"response: {response.candidates}"
if "SAFETY" in original_response:
original_response += (
"\nThe candidate content was flagged for safety reasons."
)
elif "RECITATION" in original_response:
original_response += (
"\nThe candidate content was flagged for recitation reasons."
)
raise GeminiError(
status_code=400,
message=f"No response received. Original response - {original_response}",
)
# try:
# completion_response = model_response["choices"][0]["message"].get("content")
# if completion_response is None:
# raise Exception
# except:
# original_response = f"response: {response}"
# if hasattr(response, "candidates"):
# original_response = f"response: {response.candidates}"
# if "SAFETY" in original_response:
# original_response += (
# "\nThe candidate content was flagged for safety reasons."
# )
# elif "RECITATION" in original_response:
# original_response += (
# "\nThe candidate content was flagged for recitation reasons."
# )
# raise GeminiError(
# status_code=400,
# message=f"No response received. Original response - {original_response}",
# )
## CALCULATING USAGE
prompt_str = ""
for m in messages:
if isinstance(m["content"], str):
prompt_str += m["content"]
elif isinstance(m["content"], list):
for content in m["content"]:
if content["type"] == "text":
prompt_str += content["text"]
prompt_tokens = len(encoding.encode(prompt_str))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
# ## CALCULATING USAGE
# prompt_str = ""
# for m in messages:
# if isinstance(m["content"], str):
# prompt_str += m["content"]
# elif isinstance(m["content"], list):
# for content in m["content"]:
# if content["type"] == "text":
# prompt_str += content["text"]
# prompt_tokens = len(encoding.encode(prompt_str))
# completion_tokens = len(
# encoding.encode(model_response["choices"][0]["message"].get("content", ""))
# )
model_response["created"] = int(time.time())
model_response["model"] = "gemini/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
model_response.usage = usage
return model_response
# model_response["created"] = int(time.time())
# model_response["model"] = "gemini/" + model
# usage = Usage(
# prompt_tokens=prompt_tokens,
# completion_tokens=completion_tokens,
# total_tokens=prompt_tokens + completion_tokens,
# )
# model_response.usage = usage
# return model_response
def embedding():
# logic for parsing in - calling - parsing out model embedding calls
pass
# def embedding():
# # logic for parsing in - calling - parsing out model embedding calls
# pass

View file

@ -1,17 +1,22 @@
## Uses the huggingface text generation inference API
import os, copy, types
import json
from enum import Enum
import httpx, requests
from .base import BaseLLM
import time
import litellm
from typing import Callable, Dict, List, Any, Literal, Tuple
from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper, Usage
from typing import Optional
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.types.completion import ChatCompletionMessageToolCallParam
import copy
import enum
import json
import os
import time
import types
from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
import httpx
import requests
import litellm
from litellm.types.completion import ChatCompletionMessageToolCallParam
from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class HuggingfaceError(Exception):
@ -269,7 +274,7 @@ class Huggingface(BaseLLM):
def convert_to_model_response_object(
self,
completion_response,
model_response,
model_response: litellm.ModelResponse,
task: hf_tasks,
optional_params,
encoding,
@ -278,11 +283,9 @@ class Huggingface(BaseLLM):
):
if task == "conversational":
if len(completion_response["generated_text"]) > 0: # type: ignore
model_response["choices"][0]["message"][
"content"
] = completion_response[
model_response.choices[0].message.content = completion_response[ # type: ignore
"generated_text"
] # type: ignore
]
elif task == "text-generation-inference":
if (
not isinstance(completion_response, list)
@ -295,7 +298,7 @@ class Huggingface(BaseLLM):
)
if len(completion_response[0]["generated_text"]) > 0:
model_response["choices"][0]["message"]["content"] = output_parser(
model_response.choices[0].message.content = output_parser( # type: ignore
completion_response[0]["generated_text"]
)
## GETTING LOGPROBS + FINISH REASON
@ -310,7 +313,7 @@ class Huggingface(BaseLLM):
for token in completion_response[0]["details"]["tokens"]:
if token["logprob"] != None:
sum_logprob += token["logprob"]
model_response["choices"][0]["message"]._logprob = sum_logprob
setattr(model_response.choices[0].message, "_logprob", sum_logprob) # type: ignore
if "best_of" in optional_params and optional_params["best_of"] > 1:
if (
"details" in completion_response[0]
@ -337,14 +340,14 @@ class Huggingface(BaseLLM):
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"].extend(choices_list)
model_response.choices.extend(choices_list)
elif task == "text-classification":
model_response["choices"][0]["message"]["content"] = json.dumps(
model_response.choices[0].message.content = json.dumps( # type: ignore
completion_response
)
else:
if len(completion_response[0]["generated_text"]) > 0:
model_response["choices"][0]["message"]["content"] = output_parser(
model_response.choices[0].message.content = output_parser( # type: ignore
completion_response[0]["generated_text"]
)
## CALCULATING USAGE
@ -371,14 +374,14 @@ class Huggingface(BaseLLM):
else:
completion_tokens = 0
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
model_response.usage = usage
setattr(model_response, "usage", usage)
model_response._hidden_params["original_response"] = completion_response
return model_response
@ -763,10 +766,10 @@ class Huggingface(BaseLLM):
self,
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
logging_obj=None,
model_response=None,
encoding=None,
):
super().embedding()
@ -867,15 +870,21 @@ class Huggingface(BaseLLM):
], # flatten list returned from hf
}
)
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
model_response["usage"] = {
"prompt_tokens": input_tokens,
"total_tokens": input_tokens,
}
setattr(
model_response,
"usage",
litellm.Usage(
**{
"prompt_tokens": input_tokens,
"total_tokens": input_tokens,
}
),
)
return model_response
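
The change running through this file (and the provider files that follow) swaps dict-style access on ModelResponse for attribute access, and attaches usage via setattr. A minimal sketch of the target pattern, assuming litellm's public ModelResponse and Usage objects behave as they do in this diff; the model id and token counts are illustrative:

import time

import litellm
from litellm.utils import Usage

model_response = litellm.ModelResponse()
# attribute access replaces model_response["choices"][0]["message"]["content"]
model_response.choices[0].message.content = "example output"
model_response.choices[0].finish_reason = "stop"
model_response.created = int(time.time())
model_response.model = "provider/example-model"  # hypothetical model id
# usage is attached with setattr, mirroring the replacements above
setattr(
    model_response,
    "usage",
    Usage(prompt_tokens=3, completion_tokens=2, total_tokens=5),
)
print(model_response.usage.total_tokens)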

View file

@ -1,11 +1,15 @@
import os, types
import json
import os
import time
import traceback
import types
from enum import Enum
from typing import Callable, List, Optional
import requests # type: ignore
import time, traceback
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm
from litellm.utils import Choices, Message, ModelResponse, Usage
class MaritalkError(Exception):
@ -152,9 +156,9 @@ def completion(
else:
try:
if len(completion_response["answer"]) > 0:
model_response["choices"][0]["message"]["content"] = (
completion_response["answer"]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"answer"
]
except Exception as e:
raise MaritalkError(
message=response.text, status_code=response.status_code
@ -167,8 +171,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,9 +1,12 @@
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
@ -185,7 +188,7 @@ def completion(
else:
try:
if len(completion_response["generated_text"]) > 0:
model_response["choices"][0]["message"]["content"] = (
model_response.choices[0].message.content = ( # type: ignore
completion_response["generated_text"]
)
except:
@ -198,8 +201,8 @@ def completion(
prompt_tokens = completion_response["nb_input_tokens"]
completion_tokens = completion_response["nb_generated_tokens"]
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,13 +1,21 @@
from itertools import chain
import requests, types, time # type: ignore
import json, uuid
import asyncio
import json
import time
import traceback
from typing import Optional, List
import types
import uuid
from itertools import chain
from typing import List, Optional
import aiohttp
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.types.utils import ProviderField
import httpx, aiohttp, asyncio # type: ignore
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm import verbose_logger
from litellm.types.utils import ProviderField
from .prompt_templates.factory import custom_prompt, prompt_factory
class OllamaError(Exception):
@ -138,7 +146,6 @@ class OllamaConfig:
)
]
def get_supported_openai_params(
self,
):
@ -157,7 +164,8 @@ class OllamaConfig:
# ollama wants plain base64 jpeg/png files as images. strip any leading dataURI
# and convert to jpeg if necessary.
def _convert_image(image):
import base64, io
import base64
import io
try:
from PIL import Image
@ -183,13 +191,13 @@ def _convert_image(image):
# ollama implementation
def get_ollama_response(
model_response: litellm.ModelResponse,
api_base="http://localhost:11434",
model="llama2",
prompt="Why is the sky blue?",
optional_params=None,
logging_obj=None,
acompletion: bool = False,
model_response=None,
encoding=None,
):
if api_base.endswith("/api/generate"):
@ -271,7 +279,7 @@ def get_ollama_response(
response_json = response.json()
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message(
@ -287,20 +295,24 @@ def get_ollama_response(
}
],
)
model_response["choices"][0]["message"] = message
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response["choices"][0]["message"]["content"] = response_json["response"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
model_response.choices[0].message.content = response_json["response"] # type: ignore
model_response.created = int(time.time())
model_response.model = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore
completion_tokens = response_json.get(
"eval_count", len(response_json.get("message", dict()).get("content", ""))
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
@ -346,8 +358,8 @@ def ollama_completion_stream(url, data, logging_obj):
],
)
model_response = first_chunk
model_response["choices"][0]["delta"] = delta
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].delta = delta # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
yield model_response
else:
for transformed_chunk in streamwrapper:
@ -401,8 +413,8 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
],
)
model_response = first_chunk
model_response["choices"][0]["delta"] = delta
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].delta = delta # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
yield model_response
else:
async for transformed_chunk in streamwrapper:
@ -418,7 +430,9 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
raise e
async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
async def ollama_acompletion(
url, data, model_response: litellm.ModelResponse, encoding, logging_obj
):
data["stream"] = False
try:
timeout = aiohttp.ClientTimeout(total=litellm.request_timeout) # 10 minutes
@ -442,7 +456,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
response_json = await resp.json()
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message(
@ -451,30 +465,34 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"name": function_call.get("name", function_call.get("function", None)),
"name": function_call.get(
"name", function_call.get("function", None)
),
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function",
}
],
)
model_response["choices"][0]["message"] = message
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response["choices"][0]["message"]["content"] = response_json[
"response"
]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"]
model_response.choices[0].message.content = response_json["response"] # type: ignore
model_response.created = int(time.time())
model_response.model = "ollama/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore
completion_tokens = response_json.get(
"eval_count",
len(response_json.get("message", dict()).get("content", "")),
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
except Exception as e:
@ -491,9 +509,9 @@ async def ollama_aembeddings(
api_base: str,
model: str,
prompts: list,
model_response: litellm.EmbeddingResponse,
optional_params=None,
logging_obj=None,
model_response=None,
encoding=None,
):
if api_base.endswith("/api/embeddings"):
@ -554,13 +572,19 @@ async def ollama_aembeddings(
input_tokens = len(encoding.encode(prompt))
total_input_tokens += input_tokens
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": total_input_tokens,
"total_tokens": total_input_tokens,
}
model_response.object = "list"
model_response.data = output_data
model_response.model = model
setattr(
model_response,
"usage",
litellm.Usage(
**{
"prompt_tokens": total_input_tokens,
"total_tokens": total_input_tokens,
}
),
)
return model_response
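
A hedged sketch of the format=json branch above: the raw Ollama response string is parsed and re-wrapped as an OpenAI-style tool call on the ModelResponse. The payload is an invented example; the field names come straight from the code above:

import json
import uuid

import litellm

# invented example of an Ollama /api/generate response when format=json was requested
response_json = {
    "response": json.dumps({"name": "get_weather", "arguments": {"city": "Paris"}})
}

function_call = json.loads(response_json["response"])
message = litellm.Message(
    content=None,
    tool_calls=[
        {
            "id": f"call_{str(uuid.uuid4())}",
            "function": {
                "name": function_call.get("name", function_call.get("function", None)),
                "arguments": json.dumps(function_call["arguments"]),
            },
            "type": "function",
        }
    ],
)
model_response = litellm.ModelResponse()
model_response.choices[0].message = message
model_response.choices[0].finish_reason = "tool_calls"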

View file

@ -1,15 +1,17 @@
from itertools import chain
import requests
import types
import time
import json
import uuid
import time
import traceback
import types
import uuid
from itertools import chain
from typing import Optional
from litellm import verbose_logger
import litellm
import httpx
import aiohttp
import httpx
import requests
import litellm
from litellm import verbose_logger
class OllamaError(Exception):
@ -195,6 +197,7 @@ class OllamaChatConfig:
# ollama implementation
def get_ollama_response(
model_response: litellm.ModelResponse,
api_base="http://localhost:11434",
api_key: Optional[str] = None,
model="llama2",
@ -202,7 +205,6 @@ def get_ollama_response(
optional_params=None,
logging_obj=None,
acompletion: bool = False,
model_response=None,
encoding=None,
):
if api_base.endswith("/api/chat"):
@ -295,7 +297,7 @@ def get_ollama_response(
response_json = response.json()
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message(
@ -311,22 +313,24 @@ def get_ollama_response(
}
],
)
model_response["choices"][0]["message"] = message
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response["choices"][0]["message"]["content"] = response_json["message"][
"content"
]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
model_response.choices[0].message.content = response_json["message"]["content"] # type: ignore
model_response.created = int(time.time())
model_response.model = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"]["content"])
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
@ -379,8 +383,8 @@ def ollama_completion_stream(url, api_key, data, logging_obj):
],
)
model_response = first_chunk
model_response["choices"][0]["delta"] = delta
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].delta = delta # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
yield model_response
else:
for transformed_chunk in streamwrapper:
@ -434,7 +438,9 @@ async def ollama_async_streaming(
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"name": function_call.get("name", function_call.get("function", None)),
"name": function_call.get(
"name", function_call.get("function", None)
),
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function",
@ -442,8 +448,8 @@ async def ollama_async_streaming(
],
)
model_response = first_chunk
model_response["choices"][0]["delta"] = delta
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].delta = delta # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
yield model_response
else:
async for transformed_chunk in streamwrapper:
@ -457,7 +463,7 @@ async def ollama_acompletion(
url,
api_key: Optional[str],
data,
model_response,
model_response: litellm.ModelResponse,
encoding,
logging_obj,
function_name,
@ -492,7 +498,7 @@ async def ollama_acompletion(
)
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message(
@ -510,15 +516,17 @@ async def ollama_acompletion(
}
],
)
model_response["choices"][0]["message"] = message
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response["choices"][0]["message"]["content"] = response_json[
model_response.choices[0].message.content = response_json[ # type: ignore
"message"
]["content"]
][
"content"
]
model_response["created"] = int(time.time())
model_response["model"] = "ollama_chat/" + data["model"]
model_response.created = int(time.time())
model_response.model = "ollama_chat/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
completion_tokens = response_json.get(
"eval_count",
@ -526,10 +534,14 @@ async def ollama_acompletion(
text=response_json["message"]["content"], count_response_tokens=True
),
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
except Exception as e:

View file

@ -1,11 +1,14 @@
import os
import json
from enum import Enum
import requests # type: ignore
import os
import time
from enum import Enum
from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
import requests # type: ignore
from litellm.utils import EmbeddingResponse, ModelResponse, Usage
from .prompt_templates.factory import custom_prompt, prompt_factory
class OobaboogaError(Exception):
@ -99,17 +102,15 @@ def completion(
)
else:
try:
model_response["choices"][0]["message"]["content"] = (
completion_response["choices"][0]["message"]["content"]
)
model_response.choices[0].message.content = completion_response["choices"][0]["message"]["content"] # type: ignore
except:
raise OobaboogaError(
message=json.dumps(completion_response),
status_code=response.status_code,
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=completion_response["usage"]["prompt_tokens"],
completion_tokens=completion_response["usage"]["completion_tokens"],
@ -122,10 +123,10 @@ def completion(
def embedding(
model: str,
input: list,
model_response: EmbeddingResponse,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
):
@ -166,7 +167,7 @@ def embedding(
)
# Process response data
model_response["data"] = [
model_response.data = [
{
"embedding": completion_response["data"][0]["embedding"],
"index": 0,
@ -176,8 +177,12 @@ def embedding(
num_tokens = len(completion_response["data"][0]["embedding"])
# Adding metadata to response
model_response.usage = Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
model_response["object"] = "list"
model_response["model"] = model
setattr(
model_response,
"usage",
Usage(prompt_tokens=num_tokens, total_tokens=num_tokens),
)
model_response.object = "list"
model_response.model = model
return model_response

View file

@ -18,6 +18,7 @@ import httpx
import openai
from openai import AsyncOpenAI, OpenAI
from openai.types.beta.assistant_deleted import AssistantDeleted
from openai.types.file_deleted import FileDeleted
from pydantic import BaseModel
from typing_extensions import overload, override
@ -2064,6 +2065,151 @@ class OpenAIFilesAPI(BaseLLM):
return response
async def aretrieve_file(
self,
file_id: str,
openai_client: AsyncOpenAI,
) -> FileObject:
response = await openai_client.files.retrieve(file_id=file_id)
return response
def retrieve_file(
self,
_is_async: bool,
file_id: str,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.aretrieve_file( # type: ignore
file_id=file_id,
openai_client=openai_client,
)
response = openai_client.files.retrieve(file_id=file_id)
return response
async def adelete_file(
self,
file_id: str,
openai_client: AsyncOpenAI,
) -> FileDeleted:
response = await openai_client.files.delete(file_id=file_id)
return response
def delete_file(
self,
_is_async: bool,
file_id: str,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.adelete_file( # type: ignore
file_id=file_id,
openai_client=openai_client,
)
response = openai_client.files.delete(file_id=file_id)
return response
async def alist_files(
self,
openai_client: AsyncOpenAI,
purpose: Optional[str] = None,
):
if isinstance(purpose, str):
response = await openai_client.files.list(purpose=purpose)
else:
response = await openai_client.files.list()
return response
def list_files(
self,
_is_async: bool,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
purpose: Optional[str] = None,
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.alist_files( # type: ignore
purpose=purpose,
openai_client=openai_client,
)
if isinstance(purpose, str):
response = openai_client.files.list(purpose=purpose)
else:
response = openai_client.files.list()
return response
class OpenAIBatchesAPI(BaseLLM):
"""

View file

@ -1,12 +1,14 @@
import types
import traceback
import copy
import time
import traceback
import types
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm
import httpx
import litellm
from litellm import verbose_logger
from litellm.utils import Choices, Message, ModelResponse, Usage
class PalmError(Exception):
@ -164,7 +166,7 @@ def completion(
message_obj = Message(content=None)
choice_obj = Choices(index=idx + 1, message=message_obj)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
verbose_logger.error(
"litellm.llms.palm.py::completion(): Exception occured - {}".format(str(e))
@ -188,8 +190,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = "palm/" + model
model_response.created = int(time.time())
model_response.model = "palm/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,12 +1,16 @@
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import custom_prompt, prompt_factory
class PetalsError(Exception):
@ -151,8 +155,8 @@ def completion(
else:
try:
import torch
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM # type: ignore
from transformers import AutoTokenizer
except:
raise Exception(
"Importing torch, transformers, petals failed\nTry pip installing petals \npip install git+https://github.com/bigscience-workshop/petals"
@ -189,15 +193,15 @@ def completion(
output_text = tokenizer.decode(outputs[0])
if len(output_text) > 0:
model_response["choices"][0]["message"]["content"] = output_text
model_response.choices[0].message.content = output_text # type: ignore
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -279,7 +279,7 @@ class PredibaseChatCompletion(BaseLLM):
message=f"'generated_text' is not a key response dictionary - {completion_response}",
)
if len(completion_response["generated_text"]) > 0:
model_response["choices"][0]["message"]["content"] = self.output_parser(
model_response.choices[0].message.content = self.output_parser( # type: ignore
completion_response["generated_text"]
)
## GETTING LOGPROBS + FINISH REASON
@ -294,10 +294,10 @@ class PredibaseChatCompletion(BaseLLM):
for token in completion_response["details"]["tokens"]:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
model_response["choices"][0][
"message"
]._logprob = (
sum_logprob # [TODO] move this to using the actual logprobs
setattr(
model_response.choices[0].message, # type: ignore
"_logprob",
sum_logprob, # [TODO] move this to using the actual logprobs
)
if "best_of" in optional_params and optional_params["best_of"] > 1:
if (
@ -325,7 +325,7 @@ class PredibaseChatCompletion(BaseLLM):
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"].extend(choices_list)
model_response.choices.extend(choices_list)
## CALCULATING USAGE
prompt_tokens = 0
@ -351,8 +351,8 @@ class PredibaseChatCompletion(BaseLLM):
total_tokens = prompt_tokens + completion_tokens
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -388,7 +388,7 @@ def process_response(
## Building RESPONSE OBJECT
if len(result) > 1:
model_response["choices"][0]["message"]["content"] = result
model_response.choices[0].message.content = result # type: ignore
# Calculate usage
prompt_tokens = len(encoding.encode(prompt, disallowed_special=()))
@ -398,7 +398,7 @@ def process_response(
disallowed_special=(),
)
)
model_response["model"] = "replicate/" + model
model_response.model = "replicate/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -498,7 +498,7 @@ def completion(
## Step1: Start Prediction: gets a prediction url
## Step2: Poll prediction url for response
## Step2: is handled with and without streaming
model_response["created"] = int(
model_response.created = int(
time.time()
) # for pricing this must remain right before calling api

View file

@ -1,16 +1,21 @@
import os, types, traceback
from enum import Enum
import json
import requests # type: ignore
import time
from typing import Callable, Optional, Any
import litellm
from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage
import sys
from copy import deepcopy
import httpx # type: ignore
import io
from .prompt_templates.factory import prompt_factory, custom_prompt
import json
import os
import sys
import time
import traceback
import types
from copy import deepcopy
from enum import Enum
from typing import Any, Callable, Optional
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import EmbeddingResponse, ModelResponse, Usage, get_secret
from .prompt_templates.factory import custom_prompt, prompt_factory
class SagemakerError(Exception):
@ -377,7 +382,7 @@ def completion(
if completion_output.startswith(prompt) and "<s>" in prompt:
completion_output = completion_output.replace(prompt, "", 1)
model_response["choices"][0]["message"]["content"] = completion_output
model_response.choices[0].message.content = completion_output # type: ignore
except:
raise SagemakerError(
message=f"LiteLLM Error: Unable to parse sagemaker RAW RESPONSE {json.dumps(completion_response)}",
@ -390,8 +395,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -597,7 +602,7 @@ async def async_completion(
if completion_output.startswith(data["inputs"]) and "<s>" in data["inputs"]:
completion_output = completion_output.replace(data["inputs"], "", 1)
model_response["choices"][0]["message"]["content"] = completion_output
model_response.choices[0].message.content = completion_output # type: ignore
except:
raise SagemakerError(
message=f"LiteLLM Error: Unable to parse sagemaker RAW RESPONSE {json.dumps(completion_response)}",
@ -610,8 +615,8 @@ async def async_completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -741,16 +746,20 @@ def embedding(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
model_response["usage"] = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
)
return model_response

View file

@ -3,16 +3,20 @@ Deprecated. We now do together ai calls via the openai client.
Reference: https://docs.together.ai/docs/openai-api-compatibility
"""
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import litellm
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import custom_prompt, prompt_factory
class TogetherAIError(Exception):
@ -91,145 +95,145 @@ class TogetherAIConfig:
}
def validate_environment(api_key):
if api_key is None:
raise ValueError(
"Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
)
headers = {
"accept": "application/json",
"content-type": "application/json",
"Authorization": "Bearer " + api_key,
}
return headers
# def validate_environment(api_key):
# if api_key is None:
# raise ValueError(
# "Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
# )
# headers = {
# "accept": "application/json",
# "content-type": "application/json",
# "Authorization": "Bearer " + api_key,
# }
# return headers
def completion(
model: str,
messages: list,
api_base: str,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
custom_prompt_dict={},
optional_params=None,
litellm_params=None,
logger_fn=None,
):
headers = validate_environment(api_key)
# def completion(
# model: str,
# messages: list,
# api_base: str,
# model_response: ModelResponse,
# print_verbose: Callable,
# encoding,
# api_key,
# logging_obj,
# custom_prompt_dict={},
# optional_params=None,
# litellm_params=None,
# logger_fn=None,
# ):
# headers = validate_environment(api_key)
## Load Config
config = litellm.TogetherAIConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
# ## Load Config
# config = litellm.TogetherAIConfig.get_config()
# for k, v in config.items():
# if (
# k not in optional_params
# ): # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
# optional_params[k] = v
print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}")
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details.get("roles", {}),
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
bos_token=model_prompt_details.get("bos_token", ""),
eos_token=model_prompt_details.get("eos_token", ""),
messages=messages,
)
else:
prompt = prompt_factory(
model=model,
messages=messages,
api_key=api_key,
custom_llm_provider="together_ai",
) # api key required to query together ai model list
# print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}")
# if model in custom_prompt_dict:
# # check if the model has a registered custom prompt
# model_prompt_details = custom_prompt_dict[model]
# prompt = custom_prompt(
# role_dict=model_prompt_details.get("roles", {}),
# initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
# final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
# bos_token=model_prompt_details.get("bos_token", ""),
# eos_token=model_prompt_details.get("eos_token", ""),
# messages=messages,
# )
# else:
# prompt = prompt_factory(
# model=model,
# messages=messages,
# api_key=api_key,
# custom_llm_provider="together_ai",
# ) # api key required to query together ai model list
data = {
"model": model,
"prompt": prompt,
"request_type": "language-model-inference",
**optional_params,
}
# data = {
# "model": model,
# "prompt": prompt,
# "request_type": "language-model-inference",
# **optional_params,
# }
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": api_base,
},
)
## COMPLETION CALL
if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
response = requests.post(
api_base,
headers=headers,
data=json.dumps(data),
stream=optional_params["stream_tokens"],
)
return response.iter_lines()
else:
response = requests.post(api_base, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data},
)
print_verbose(f"raw model_response: {response.text}")
## RESPONSE OBJECT
if response.status_code != 200:
raise TogetherAIError(
status_code=response.status_code, message=response.text
)
completion_response = response.json()
# ## LOGGING
# logging_obj.pre_call(
# input=prompt,
# api_key=api_key,
# additional_args={
# "complete_input_dict": data,
# "headers": headers,
# "api_base": api_base,
# },
# )
# ## COMPLETION CALL
# if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
# response = requests.post(
# api_base,
# headers=headers,
# data=json.dumps(data),
# stream=optional_params["stream_tokens"],
# )
# return response.iter_lines()
# else:
# response = requests.post(api_base, headers=headers, data=json.dumps(data))
# ## LOGGING
# logging_obj.post_call(
# input=prompt,
# api_key=api_key,
# original_response=response.text,
# additional_args={"complete_input_dict": data},
# )
# print_verbose(f"raw model_response: {response.text}")
# ## RESPONSE OBJECT
# if response.status_code != 200:
# raise TogetherAIError(
# status_code=response.status_code, message=response.text
# )
# completion_response = response.json()
if "error" in completion_response:
raise TogetherAIError(
message=json.dumps(completion_response),
status_code=response.status_code,
)
elif "error" in completion_response["output"]:
raise TogetherAIError(
message=json.dumps(completion_response["output"]),
status_code=response.status_code,
)
# if "error" in completion_response:
# raise TogetherAIError(
# message=json.dumps(completion_response),
# status_code=response.status_code,
# )
# elif "error" in completion_response["output"]:
# raise TogetherAIError(
# message=json.dumps(completion_response["output"]),
# status_code=response.status_code,
# )
if len(completion_response["output"]["choices"][0]["text"]) >= 0:
model_response["choices"][0]["message"]["content"] = completion_response[
"output"
]["choices"][0]["text"]
# if len(completion_response["output"]["choices"][0]["text"]) >= 0:
# model_response.choices[0].message.content = completion_response["output"][
# "choices"
# ][0]["text"]
## CALCULATING USAGE
print_verbose(
f"CALCULATING TOGETHERAI TOKEN USAGE. Model Response: {model_response}; model_response['choices'][0]['message'].get('content', ''): {model_response['choices'][0]['message'].get('content', None)}"
)
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
if "finish_reason" in completion_response["output"]["choices"][0]:
model_response.choices[0].finish_reason = completion_response["output"][
"choices"
][0]["finish_reason"]
model_response["created"] = int(time.time())
model_response["model"] = "together_ai/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
setattr(model_response, "usage", usage)
return model_response
# ## CALCULATING USAGE
# print_verbose(
# f"CALCULATING TOGETHERAI TOKEN USAGE. Model Response: {model_response}; model_response['choices'][0]['message'].get('content', ''): {model_response['choices'][0]['message'].get('content', None)}"
# )
# prompt_tokens = len(encoding.encode(prompt))
# completion_tokens = len(
# encoding.encode(model_response["choices"][0]["message"].get("content", ""))
# )
# if "finish_reason" in completion_response["output"]["choices"][0]:
# model_response.choices[0].finish_reason = completion_response["output"][
# "choices"
# ][0]["finish_reason"]
# model_response["created"] = int(time.time())
# model_response["model"] = "together_ai/" + model
# usage = Usage(
# prompt_tokens=prompt_tokens,
# completion_tokens=completion_tokens,
# total_tokens=prompt_tokens + completion_tokens,
# )
# setattr(model_response, "usage", usage)
# return model_response
def embedding():
# logic for parsing in - calling - parsing out model embedding calls
pass
# def embedding():
# # logic for parsing in - calling - parsing out model embedding calls
# pass
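
With the raw HTTP path above commented out, Together AI requests are expected to go through litellm.completion(), which routes them via the OpenAI-compatible client referenced in the docstring. A hedged sketch; the model id and key are placeholders, and an environment key such as TOGETHERAI_API_KEY (as read elsewhere in this diff) also works:

import litellm

response = litellm.completion(
    model="together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1",  # example Together AI model id
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    api_key="your-together-ai-key",  # placeholder; or set TOGETHERAI_API_KEY
)
print(response.choices[0].message.content)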

View file

@ -852,16 +852,14 @@ def completion(
## RESPONSE OBJECT
if isinstance(completion_response, litellm.Message):
model_response["choices"][0]["message"] = completion_response
model_response.choices[0].message = completion_response # type: ignore
elif len(str(completion_response)) > 0:
model_response["choices"][0]["message"]["content"] = str(
completion_response
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.choices[0].message.content = str(completion_response) # type: ignore
model_response.created = int(time.time())
model_response.model = model
## CALCULATING USAGE
if model in litellm.vertex_language_models and response_obj is not None:
model_response["choices"][0].finish_reason = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
response_obj.candidates[0].finish_reason.name
)
usage = Usage(
@ -912,7 +910,7 @@ async def async_completion(
request_str: str,
print_verbose: Callable,
logging_obj,
encoding=None,
encoding,
client_options=None,
instances=None,
vertex_project=None,
@ -1088,16 +1086,16 @@ async def async_completion(
## RESPONSE OBJECT
if isinstance(completion_response, litellm.Message):
model_response["choices"][0]["message"] = completion_response
model_response.choices[0].message = completion_response # type: ignore
elif len(str(completion_response)) > 0:
model_response["choices"][0]["message"]["content"] = str(
model_response.choices[0].message.content = str( # type: ignore
completion_response
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
## CALCULATING USAGE
if model in litellm.vertex_language_models and response_obj is not None:
model_response["choices"][0].finish_reason = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
response_obj.candidates[0].finish_reason.name
)
usage = Usage(
@ -1377,16 +1375,16 @@ class VertexAITextEmbeddingConfig(BaseModel):
def embedding(
model: str,
input: Union[list, str],
print_verbose,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
vertex_project=None,
vertex_location=None,
vertex_credentials=None,
aembedding=False,
print_verbose=None,
):
# logic for parsing in - calling - parsing out model embedding calls
try:
@ -1484,15 +1482,15 @@ def embedding(
"embedding": embedding.values,
}
)
input_tokens += embedding.statistics.token_count
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens += embedding.statistics.token_count # type: ignore
model_response.object = "list"
model_response.data = embedding_response
model_response.model = model
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
setattr(model_response, "usage", usage)
return model_response
@ -1500,8 +1498,8 @@ def embedding(
async def async_embedding(
model: str,
input: Union[list, str],
model_response: litellm.EmbeddingResponse,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
client=None,
@ -1541,11 +1539,11 @@ async def async_embedding(
)
input_tokens += embedding.statistics.token_count
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
model_response.object = "list"
model_response.data = embedding_response
model_response.model = model
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
setattr(model_response, "usage", usage)
return model_response

View file

@ -367,8 +367,8 @@ async def async_completion(
prompt_tokens = message.usage.input_tokens
completion_tokens = message.usage.output_tokens
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,11 +1,15 @@
import os
import json
import os
import time # type: ignore
from enum import Enum
from typing import Any, Callable
import httpx
import requests # type: ignore
import time, httpx # type: ignore
from typing import Callable, Any
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import custom_prompt, prompt_factory
llm = None
@ -91,14 +95,14 @@ def completion(
)
print_verbose(f"raw model_response: {outputs}")
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = outputs[0].outputs[0].text
model_response.choices[0].message.content = outputs[0].outputs[0].text # type: ignore
## CALCULATING USAGE
prompt_tokens = len(outputs[0].prompt_token_ids)
completion_tokens = len(outputs[0].outputs[0].token_ids)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -173,14 +177,14 @@ def batch_completions(
for output in outputs:
model_response = ModelResponse()
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = output.outputs[0].text
model_response.choices[0].message.content = output.outputs[0].text # type: ignore
## CALCULATING USAGE
prompt_tokens = len(output.prompt_token_ids)
completion_tokens = len(output.outputs[0].token_ids)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -25,7 +25,13 @@ import requests # type: ignore
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.utils import ModelResponse, Usage, get_secret
from litellm.utils import (
EmbeddingResponse,
ModelResponse,
Usage,
get_secret,
map_finish_reason,
)
from .base import BaseLLM
from .prompt_templates import factory as ptf
@ -414,14 +420,16 @@ class IBMWatsonXAI(BaseLLM):
generated_text = json_resp["results"][0]["generated_text"]
prompt_tokens = json_resp["results"][0]["input_token_count"]
completion_tokens = json_resp["results"][0]["generated_token_count"]
model_response["choices"][0]["message"]["content"] = generated_text
model_response["finish_reason"] = json_resp["results"][0]["stop_reason"]
model_response.choices[0].message.content = generated_text # type: ignore
model_response.choices[0].finish_reason = map_finish_reason(
json_resp["results"][0]["stop_reason"]
)
if json_resp.get("created_at"):
model_response["created"] = datetime.fromisoformat(
json_resp["created_at"]
).timestamp()
model_response.created = int(
datetime.fromisoformat(json_resp["created_at"]).timestamp()
)
else:
model_response["created"] = int(time.time())
model_response.created = int(time.time())
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -463,7 +471,7 @@ class IBMWatsonXAI(BaseLLM):
prompt = convert_messages_to_prompt(
model, messages, provider, custom_prompt_dict
)
model_response["model"] = model
model_response.model = model
def process_stream_response(
stream_resp: Union[Iterator[str], AsyncIterator],
@ -551,10 +559,10 @@ class IBMWatsonXAI(BaseLLM):
raise WatsonXAIError(status_code=500, message=str(e))
def _process_embedding_response(
self, json_resp: dict, model_response: Union[ModelResponse, None] = None
) -> ModelResponse:
self, json_resp: dict, model_response: Optional[EmbeddingResponse] = None
) -> EmbeddingResponse:
if model_response is None:
model_response = ModelResponse(model=json_resp.get("model_id", None))
model_response = EmbeddingResponse(model=json_resp.get("model_id", None))
results = json_resp.get("results", [])
embedding_response = []
for idx, result in enumerate(results):
@ -565,8 +573,8 @@ class IBMWatsonXAI(BaseLLM):
"embedding": result["embedding"],
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response.object = "list"
model_response.data = embedding_response
input_tokens = json_resp.get("input_token_count", 0)
setattr(
model_response,
@ -583,9 +591,9 @@ class IBMWatsonXAI(BaseLLM):
self,
model: str,
input: Union[list, str],
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
print_verbose=None,
@ -602,7 +610,7 @@ class IBMWatsonXAI(BaseLLM):
if k not in optional_params:
optional_params[k] = v
model_response["model"] = model
model_response.model = model
# Load auth variables from environment variables
if isinstance(input, str):
@ -635,12 +643,12 @@ class IBMWatsonXAI(BaseLLM):
}
request_manager = RequestManager(logging_obj)
def handle_embedding(request_params: dict) -> ModelResponse:
def handle_embedding(request_params: dict) -> EmbeddingResponse:
with request_manager.request(request_params, input=input) as resp:
json_resp = resp.json()
return self._process_embedding_response(json_resp, model_response)
async def handle_aembedding(request_params: dict) -> ModelResponse:
async def handle_aembedding(request_params: dict) -> EmbeddingResponse:
async with request_manager.async_request(
request_params, input=input
) as resp:

View file

@ -38,6 +38,7 @@ import dotenv
import httpx
import openai
import tiktoken
from pydantic import BaseModel
from typing_extensions import overload
import litellm
@ -48,6 +49,7 @@ from litellm import ( # type: ignore
get_litellm_params,
get_optional_params,
)
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.utils import (
CustomStreamWrapper,
@ -520,7 +522,7 @@ def mock_completion(
)
return response
if n is None:
model_response["choices"][0]["message"]["content"] = mock_response
model_response.choices[0].message.content = mock_response # type: ignore
else:
_all_choices = []
for i in range(n):
@ -531,12 +533,12 @@ def mock_completion(
),
)
_all_choices.append(_choice)
model_response["choices"] = _all_choices
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.choices = _all_choices # type: ignore
model_response.created = int(time.time())
model_response.model = model
if mock_tool_calls:
model_response["choices"][0]["message"]["tool_calls"] = [
model_response.choices[0].message.tool_calls = [ # type: ignore
ChatCompletionMessageToolCall(**tool_call)
for tool_call in mock_tool_calls
]
@ -1932,51 +1934,7 @@ def completion(
"""
Deprecated. We now do together ai calls via the openai client - https://docs.together.ai/docs/openai-api-compatibility
"""
custom_llm_provider = "together_ai"
together_ai_key = (
api_key
or litellm.togetherai_api_key
or get_secret("TOGETHER_AI_TOKEN")
or get_secret("TOGETHERAI_API_KEY")
or litellm.api_key
)
api_base = (
api_base
or litellm.api_base
or get_secret("TOGETHERAI_API_BASE")
or "https://api.together.xyz/inference"
)
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
model_response = together_ai.completion(
model=model,
messages=messages,
api_base=api_base,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
api_key=together_ai_key,
logging_obj=logging,
custom_prompt_dict=custom_prompt_dict,
)
if (
"stream_tokens" in optional_params
and optional_params["stream_tokens"] == True
):
# don't try to access stream object,
response = CustomStreamWrapper(
model_response,
model,
custom_llm_provider="together_ai",
logging_obj=logging,
)
return response
response = model_response
pass
elif custom_llm_provider == "palm":
palm_api_key = api_key or get_secret("PALM_API_KEY") or litellm.api_key
@ -2459,10 +2417,10 @@ def completion(
## LOGGING
generator = ollama.get_ollama_response(
api_base,
model,
prompt,
optional_params,
api_base=api_base,
model=model,
prompt=prompt,
optional_params=optional_params,
logging_obj=logging,
acompletion=acompletion,
model_response=model_response,
@ -2488,11 +2446,11 @@ def completion(
)
## LOGGING
generator = ollama_chat.get_ollama_response(
api_base,
api_key,
model,
messages,
optional_params,
api_base=api_base,
api_key=api_key,
model=model,
messages=messages,
optional_params=optional_params,
logging_obj=logging,
acompletion=acompletion,
model_response=model_response,
@ -2670,9 +2628,9 @@ def completion(
"""
string_response = response_json["data"][0]["output"][0]
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = string_response
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.choices[0].message.content = string_response # type: ignore
model_response.created = int(time.time())
model_response.model = model
response = model_response
else:
raise ValueError(
@ -3463,7 +3421,7 @@ def embedding(
or api_base
or get_secret("OLLAMA_API_BASE")
or "http://localhost:11434"
)
) # type: ignore
if isinstance(input, str):
input = [input]
if not all(isinstance(item, str) for item in input):
@ -3473,9 +3431,11 @@ def embedding(
llm_provider="ollama", # type: ignore
)
ollama_embeddings_fn = (
ollama.ollama_aembeddings if aembedding else ollama.ollama_embeddings
ollama.ollama_aembeddings
if aembedding is True
else ollama.ollama_embeddings
)
response = ollama_embeddings_fn(
response = ollama_embeddings_fn( # type: ignore
api_base=api_base,
model=model,
prompts=input,
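The embedding hunk above now selects ollama.ollama_aembeddings only when aembedding is True. A hedged sketch of hitting both branches against a local Ollama server; the model name is illustrative and the api_base matches the fallback default shown above:

```python
# Sketch: sync vs async Ollama embeddings, exercising the selection above.
# Model name is illustrative; the api_base matches the fallback default above.
import asyncio
import litellm

sync_resp = litellm.embedding(           # aembedding is False -> ollama_embeddings
    model="ollama/nomic-embed-text",
    input=["hello world"],
    api_base="http://localhost:11434",
)
print(len(sync_resp.data[0]["embedding"]))

async def main():                        # aembedding is True -> ollama_aembeddings
    return await litellm.aembedding(
        model="ollama/nomic-embed-text",
        input=["hello world"],
        api_base="http://localhost:11434",
    )

asyncio.run(main())
```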
@@ -3943,6 +3903,63 @@ def text_completion(
return text_completion_response
###### Adapter Completion ################
async def aadapter_completion(*, adapter_id: str, **kwargs) -> Optional[BaseModel]:
"""
Implemented to handle async calls for adapter_completion()
"""
try:
translation_obj: Optional[CustomLogger] = None
for item in litellm.adapters:
if item["id"] == adapter_id:
translation_obj = item["adapter"]
if translation_obj is None:
raise ValueError(
"No matching adapter given. Received 'adapter_id'={}, litellm.adapters={}".format(
adapter_id, litellm.adapters
)
)
new_kwargs = translation_obj.translate_completion_input_params(kwargs=kwargs)
response: ModelResponse = await acompletion(**new_kwargs) # type: ignore
translated_response = translation_obj.translate_completion_output_params(
response=response
)
return translated_response
except Exception as e:
raise e
def adapter_completion(*, adapter_id: str, **kwargs) -> Optional[BaseModel]:
translation_obj: Optional[CustomLogger] = None
for item in litellm.adapters:
if item["id"] == adapter_id:
translation_obj = item["adapter"]
if translation_obj is None:
raise ValueError(
"No matching adapter given. Received 'adapter_id'={}, litellm.adapters={}".format(
adapter_id, litellm.adapters
)
)
new_kwargs = translation_obj.translate_completion_input_params(kwargs=kwargs)
response: ModelResponse = completion(**new_kwargs) # type: ignore
translated_response = translation_obj.translate_completion_output_params(
response=response
)
return translated_response
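The new aadapter_completion/adapter_completion helpers look up a registered adapter by id, translate the incoming request, call (a)completion, and translate the response back. A minimal sketch of registering and using an adapter; MyAdapter is hypothetical, while the anthropic_adapter wired up later in this diff follows the same translate-in / translate-out shape:

```python
# Sketch: registering a custom adapter and routing a request through
# adapter_completion(). MyAdapter is hypothetical.
import litellm
from litellm.integrations.custom_logger import CustomLogger

class MyAdapter(CustomLogger):
    def translate_completion_input_params(self, kwargs):
        # map the caller's schema into litellm.completion kwargs
        return {"model": kwargs["model"], "messages": kwargs["messages"]}

    def translate_completion_output_params(self, response):
        # map the ModelResponse back into the caller's schema (pass-through here)
        return response

litellm.adapters = [{"id": "my-adapter", "adapter": MyAdapter()}]

translated = litellm.adapter_completion(
    adapter_id="my-adapter",
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)
print(translated)
```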
##### Moderation #######################

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-1c3809c50f029674.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-1c3809c50f029674.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/275ab6ee150b4fea.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[48951,[\"665\",\"static/chunks/3014691f-589a5f4865c3822f.js\",\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"294\",\"static/chunks/294-0e35509d5ca95267.js\",\"131\",\"static/chunks/131-6a03368053f9d26d.js\",\"684\",\"static/chunks/684-bb2d2f93d92acb0b.js\",\"759\",\"static/chunks/759-83a8bdddfe32b5d9.js\",\"777\",\"static/chunks/777-3264d0959a54279d.js\",\"931\",\"static/chunks/app/page-0cfbdaa2bf8fb022.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/275ab6ee150b4fea.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"LmgW0mreu0hjU2N9CAPDM\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-1c3809c50f029674.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-1c3809c50f029674.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/275ab6ee150b4fea.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[48951,[\"665\",\"static/chunks/3014691f-589a5f4865c3822f.js\",\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"294\",\"static/chunks/294-0e35509d5ca95267.js\",\"131\",\"static/chunks/131-19b05e5ce40fa85d.js\",\"684\",\"static/chunks/684-bb2d2f93d92acb0b.js\",\"759\",\"static/chunks/759-d7572f2a46f911d5.js\",\"777\",\"static/chunks/777-3264d0959a54279d.js\",\"931\",\"static/chunks/app/page-1cc1412fb406fc70.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/275ab6ee150b4fea.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"aCz2wdplG6aqWrQnod4_6\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[48951,["665","static/chunks/3014691f-589a5f4865c3822f.js","936","static/chunks/2f6dbc85-052c4579f80d66ae.js","294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","684","static/chunks/684-bb2d2f93d92acb0b.js","759","static/chunks/759-83a8bdddfe32b5d9.js","777","static/chunks/777-3264d0959a54279d.js","931","static/chunks/app/page-0cfbdaa2bf8fb022.js"],""]
3:I[48951,["665","static/chunks/3014691f-589a5f4865c3822f.js","936","static/chunks/2f6dbc85-052c4579f80d66ae.js","294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-19b05e5ce40fa85d.js","684","static/chunks/684-bb2d2f93d92acb0b.js","759","static/chunks/759-d7572f2a46f911d5.js","777","static/chunks/777-3264d0959a54279d.js","931","static/chunks/app/page-1cc1412fb406fc70.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["LmgW0mreu0hjU2N9CAPDM",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["aCz2wdplG6aqWrQnod4_6",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[87494,["294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","777","static/chunks/777-3264d0959a54279d.js","418","static/chunks/app/model_hub/page-6575356e2cde4d07.js"],""]
3:I[87494,["294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-19b05e5ce40fa85d.js","777","static/chunks/777-3264d0959a54279d.js","418","static/chunks/app/model_hub/page-6575356e2cde4d07.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["LmgW0mreu0hjU2N9CAPDM",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["aCz2wdplG6aqWrQnod4_6",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@@ -2,6 +2,6 @@
3:I[667,["665","static/chunks/3014691f-589a5f4865c3822f.js","294","static/chunks/294-0e35509d5ca95267.js","684","static/chunks/684-bb2d2f93d92acb0b.js","777","static/chunks/777-3264d0959a54279d.js","461","static/chunks/app/onboarding/page-c73480cdcfdbe5ac.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["LmgW0mreu0hjU2N9CAPDM",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["aCz2wdplG6aqWrQnod4_6",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@@ -1,19 +1,14 @@
model_list:
- model_name: "*"
- model_name: azure-ai-mistral
litellm_params:
model: "openai/*"
- model_name: gemini-1.5-flash
api_base: os.environ/AZURE_AI_MISTRAL_API_BASE
api_key: os.environ/AZURE_AI_MISTRAL_API_KEY
model: azure_ai/Mistral-large-nmefg
- model_name: azure-ai-phi
litellm_params:
model: gemini/gemini-1.5-flash
- model_name: whisper
litellm_params:
model: azure/azure-whisper
api_version: 2024-02-15-preview
api_base: os.environ/AZURE_EUROPE_API_BASE
api_key: os.environ/AZURE_EUROPE_API_KEY
model_info:
mode: audio_transcription
api_base: os.environ/AZURE_AI_PHI_API_BASE
api_key: os.environ/AZURE_AI_PHI_API_KEY
model: azure_ai/Phi-3-medium-128k-instruct-fpmvj
general_settings:

View file

@@ -204,6 +204,10 @@ class LiteLLMRoutes(enum.Enum):
# files
"/v1/files",
"/files",
"/v1/files/{file_id}",
"/files/{file_id}",
"/v1/files/{file_id}/content",
"/files/{file_id}/content",
# assistants-related routes
"/assistants",
"/v1/assistants",

View file

@@ -71,6 +71,11 @@ azure_api_key_header = APIKeyHeader(
auto_error=False,
description="Some older versions of the openai Python package will send an API-Key header with just the API key ",
)
anthropic_api_key_header = APIKeyHeader(
name="x-api-key",
auto_error=False,
description="If anthropic client used.",
)
def _get_bearer_token(
@@ -87,6 +92,9 @@ async def user_api_key_auth(
request: Request,
api_key: str = fastapi.Security(api_key_header),
azure_api_key_header: str = fastapi.Security(azure_api_key_header),
anthropic_api_key_header: Optional[str] = fastapi.Security(
anthropic_api_key_header
),
) -> UserAPIKeyAuth:
from litellm.proxy.proxy_server import (
@@ -114,6 +122,9 @@ async def user_api_key_auth(
elif isinstance(azure_api_key_header, str):
api_key = azure_api_key_header
elif isinstance(anthropic_api_key_header, str):
api_key = anthropic_api_key_header
parent_otel_span: Optional[Span] = None
if open_telemetry_logger is not None:
parent_otel_span = open_telemetry_logger.tracer.start_span(
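With anthropic_api_key_header wired into user_api_key_auth above, a request that authenticates the way the Anthropic SDK does (via x-api-key) is now accepted. A hedged sketch against the proxy; the URL, key, and model mirror test values that appear elsewhere in this diff:

```python
# Sketch: authenticating with the Anthropic-style `x-api-key` header picked up
# by the new anthropic_api_key_header above. URL/key/model mirror test values
# used elsewhere in this diff.
import httpx

resp = httpx.post(
    "http://localhost:4000/v1/messages",
    headers={
        "x-api-key": "sk-1234",            # read by user_api_key_auth
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    },
    json={
        "model": "fake-openai-endpoint",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
print(resp.status_code, resp.json())
```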

View file

@@ -25,3 +25,38 @@ if os.environ.get("LITELLM_PROFILE", "false").lower() == "true":
result.append(f"{stat.traceback.format()}: {stat.size / 1024} KiB")
return {"top_50_memory_usage": result}
@router.get("/otel-spans", include_in_schema=False)
async def get_otel_spans():
from litellm.integrations.opentelemetry import OpenTelemetry
from litellm.proxy.proxy_server import open_telemetry_logger
open_telemetry_logger: OpenTelemetry = open_telemetry_logger
otel_exporter = open_telemetry_logger.OTEL_EXPORTER
recorded_spans = otel_exporter.get_finished_spans()
print("Spans: ", recorded_spans) # noqa
most_recent_parent = None
most_recent_start_time = 1000000
spans_grouped_by_parent = {}
for span in recorded_spans:
if span.parent is not None:
parent_trace_id = span.parent.trace_id
if parent_trace_id not in spans_grouped_by_parent:
spans_grouped_by_parent[parent_trace_id] = []
spans_grouped_by_parent[parent_trace_id].append(span.name)
# check time of span
if span.start_time > most_recent_start_time:
most_recent_parent = parent_trace_id
most_recent_start_time = span.start_time
# these are otel spans - get the span name
span_names = [span.name for span in recorded_spans]
return {
"otel_spans": span_names,
"spans_grouped_by_parent": spans_grouped_by_parent,
"most_recent_parent": most_recent_parent,
}
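A hedged sketch of reading this debug route; it assumes the proxy's OTEL_EXPORTER is an in-memory exporter, since the handler calls get_finished_spans() on it, and the URL is illustrative:

```python
# Sketch: polling the /otel-spans debug route added above. Assumes the proxy's
# OTEL_EXPORTER is an in-memory exporter, since the handler calls
# get_finished_spans() on it. URL is illustrative.
import httpx

data = httpx.get("http://localhost:4000/otel-spans").json()
print(data["otel_spans"])               # flat list of recorded span names
print(data["spans_grouped_by_parent"])  # {parent trace id: [span names]}
print(data["most_recent_parent"])       # parent trace id of the newest span
```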

View file

@@ -35,6 +35,10 @@ def initialize_callbacks_on_proxy(
open_telemetry_logger = OpenTelemetry()
# Add Otel as a service callback
if "otel" not in litellm.service_callback:
litellm.service_callback.append("otel")
imported_list.append(open_telemetry_logger)
setattr(proxy_server, "open_telemetry_logger", open_telemetry_logger)
elif isinstance(callback, str) and callback == "presidio":

View file

@@ -0,0 +1,11 @@
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
cache: true
callbacks: ["otel"]

View file

@@ -406,6 +406,19 @@ async def active_callbacks():
}
def callback_name(callback):
if isinstance(callback, str):
return callback
try:
return callback.__name__
except AttributeError:
try:
return callback.__class__.__name__
except AttributeError:
return str(callback)
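callback_name() normalizes the mixed entries litellm.success_callback can hold (strings, plain functions, class instances) into readable names for /health/readiness. A tiny sketch of its behaviour, assuming the helper above is in scope; log_fn and MyLogger are hypothetical stand-ins for real callbacks:

```python
# Sketch of callback_name() behaviour; assumes the helper defined above is in
# scope. log_fn and MyLogger are hypothetical stand-ins for real callbacks.
def log_fn(kwargs):
    pass

class MyLogger:
    pass

print(callback_name("langfuse"))   # "langfuse" - plain string passes through
print(callback_name(log_fn))       # "log_fn"   - function __name__
print(callback_name(MyLogger()))   # "MyLogger" - falls back to __class__.__name__
```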
@router.get(
"/health/readiness",
tags=["health"],
@@ -424,8 +437,8 @@ async def health_readiness():
try:
# this was returning a JSON of the values in some of the callbacks
# all we need is the callback name, hence we do str(callback)
success_callback_names = [str(x) for x in litellm.success_callback]
except:
success_callback_names = [callback_name(x) for x in litellm.success_callback]
except AttributeError:
# don't let this block the /health/readiness response, if we can't convert to str -> return litellm.success_callback
success_callback_names = litellm.success_callback

View file

@@ -0,0 +1,599 @@
######################################################################
# /v1/files Endpoints
# Equivalent of https://platform.openai.com/docs/api-reference/files
######################################################################
import asyncio
import traceback
from datetime import datetime, timedelta, timezone
from typing import List, Optional
import fastapi
import httpx
from fastapi import (
APIRouter,
Depends,
File,
Form,
Header,
HTTPException,
Request,
Response,
UploadFile,
status,
)
import litellm
from litellm import CreateFileRequest, FileContentRequest
from litellm._logging import verbose_proxy_logger
from litellm.batches.main import FileObject
from litellm.proxy._types import *
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
router = APIRouter()
@router.post(
"/v1/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.post(
"/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def create_file(
request: Request,
fastapi_response: Response,
purpose: str = Form(...),
file: UploadFile = File(...),
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Upload a file that can be used across the Assistants API and Batch API
This is the equivalent of POST https://api.openai.com/v1/files
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/create
Example Curl
```
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Use orjson to parse JSON data, orjson speeds up requests significantly
# Read the file content
file_content = await file.read()
# Prepare the data for forwarding
data = {"purpose": purpose}
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# Prepare the file data according to FileTypes
file_data = (file.filename, file_content, file.content_type)
_create_file_request = CreateFileRequest(file=file_data, **data)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.acreate_file(
custom_llm_provider="openai", **_create_file_request
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.create_file(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
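Beyond the curl in the docstring, a hedged sketch of exercising this route with the OpenAI SDK pointed at the proxy; the base URL and key mirror the docstring's example, and the file name is illustrative:

```python
# Sketch: uploading through the /v1/files route above with the OpenAI SDK.
# Base URL and key mirror the docstring's curl example; file name is illustrative.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-1234")

with open("mydata.jsonl", "rb") as f:
    file_obj = client.files.create(file=f, purpose="batch")

print(file_obj.id, file_obj.purpose)
```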
@router.get(
"/v1/files/{file_id:path}",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.get(
"/files/{file_id:path}",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def get_file(
request: Request,
fastapi_response: Response,
file_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Returns information about a specific file that can be used across the Assistants API and Batch API
This is the equivalent of GET https://api.openai.com/v1/files/{file_id}
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/retrieve
Example Curl
```
curl http://localhost:4000/v1/files/file-abc123 \
-H "Authorization: Bearer sk-1234"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.afile_retrieve(
custom_llm_provider="openai", file_id=file_id, **data
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.retrieve_file(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
@router.delete(
"/v1/files/{file_id:path}",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.delete(
"/files/{file_id:path}",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def delete_file(
request: Request,
fastapi_response: Response,
file_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Deletes a specified file that can be used across the Assistants API and Batch API
This is the equivalent of DELETE https://api.openai.com/v1/files/{file_id}
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/delete
Example Curl
```
curl http://localhost:4000/v1/files/file-abc123 \
-X DELETE \
-H "Authorization: Bearer $OPENAI_API_KEY"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.afile_delete(
custom_llm_provider="openai", file_id=file_id, **data
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.retrieve_file(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
@router.get(
"/v1/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.get(
"/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def list_files(
request: Request,
fastapi_response: Response,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
purpose: Optional[str] = None,
):
"""
Returns a list of files that can be used across the Assistants API and Batch API
This is the equivalent of GET https://api.openai.com/v1/files/
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/list
Example Curl
```
curl http://localhost:4000/v1/files\
-H "Authorization: Bearer sk-1234"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.afile_list(
custom_llm_provider="openai", purpose=purpose, **data
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.list_files(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
@router.get(
"/v1/files/{file_id:path}/content",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.get(
"/files/{file_id:path}/content",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def get_file_content(
request: Request,
fastapi_response: Response,
file_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Returns the contents of a specified file that can be used across the Assistants API and Batch API
This is the equivalent of GET https://api.openai.com/v1/files/{file_id}/content
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/retrieve-contents
Example Curl
```
curl http://localhost:4000/v1/files/file-abc123/content \
-H "Authorization: Bearer sk-1234"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.afile_content(
custom_llm_provider="openai", file_id=file_id, **data
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
httpx_response: Optional[httpx.Response] = getattr(response, "response", None)
if httpx_response is None:
raise ValueError(
f"Invalid response - response.response is None - got {response}"
)
return Response(
content=httpx_response.content,
status_code=httpx_response.status_code,
headers=httpx_response.headers,
)
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.retrieve_file_content(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
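Taken together, the routes above cover the OpenAI files lifecycle. A hedged sketch of the remaining operations through the proxy with the OpenAI SDK; the file id is illustrative, and the base URL and key mirror the docstring examples:

```python
# Sketch: list / retrieve / content / delete against the proxy routes above.
# The file id is illustrative; base URL and key mirror the docstring examples.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-1234")

print(client.files.list(purpose="batch"))       # GET    /v1/files
info = client.files.retrieve("file-abc123")     # GET    /v1/files/{file_id}
content = client.files.content("file-abc123")   # GET    /v1/files/{file_id}/content
print(info.filename, len(content.read()))
client.files.delete("file-abc123")              # DELETE /v1/files/{file_id}
```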

View file

@@ -4,47 +4,14 @@ model_list:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: llama3
- model_name: gemini-flash
litellm_params:
model: groq/llama3-8b-8192
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
- model_name: "*"
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
- model_name: mistral-embed
litellm_params:
model: mistral/mistral-embed
model: gemini/gemini-1.5-flash
general_settings:
pass_through_endpoints:
- path: "/v1/rerank"
target: "https://api.cohere.com/v1/rerank"
auth: true # 👈 Key change to use LiteLLM Auth / Keys
headers:
Authorization: "bearer os.environ/COHERE_API_KEY"
content-type: application/json
accept: application/json
- path: "/api/public/ingestion"
target: "https://us.cloud.langfuse.com/api/public/ingestion"
auth: true
headers:
LANGFUSE_PUBLIC_KEY: "os.environ/LANGFUSE_DEV_PUBLIC_KEY"
LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_DEV_SK_KEY"
general_settings:
master_key: sk-1234
litellm_settings:
guardrails:
- prompt_injection:
callbacks: [lakera_prompt_injection, hide_secrets]
default_on: true
- hide_secrets:
callbacks: [hide_secrets]
default_on: true
assistant_settings:
custom_llm_provider: openai
litellm_params:
api_key: os.environ/OPENAI_API_KEY
cache: true
callbacks: ["otel"]

View file

@@ -1,24 +1,18 @@
import ast
import asyncio
import copy
import hashlib
import importlib
import inspect
import os
import platform
import random
import re
import secrets
import shutil
import subprocess
import sys
import threading
import time
import traceback
import uuid
import warnings
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Set, get_args
from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Any, List, Optional
import requests
@@ -106,7 +100,6 @@ import litellm
from litellm import (
CancelBatchRequest,
CreateBatchRequest,
CreateFileRequest,
ListBatchRequest,
RetrieveBatchRequest,
)
@@ -174,6 +167,9 @@ from litellm.proxy.management_endpoints.key_management_endpoints import (
router as key_management_router,
)
from litellm.proxy.management_endpoints.team_endpoints import router as team_router
from litellm.proxy.openai_files_endpoints.files_endpoints import (
router as openai_files_router,
)
from litellm.proxy.pass_through_endpoints.pass_through_endpoints import (
initialize_pass_through_endpoints,
)
@@ -213,6 +209,12 @@ from litellm.router import (
from litellm.router import ModelInfo as RouterModelInfo
from litellm.router import updateDeployment
from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
from litellm.types.llms.anthropic import (
AnthropicMessagesRequest,
AnthropicResponse,
AnthropicResponseContentBlockText,
AnthropicResponseUsageBlock,
)
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import RouterGeneralSettings
@@ -2667,6 +2669,11 @@ async def startup_event():
def model_list(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Use `/model/info` - to get detailed model information, example - pricing, mode, etc.
This is just for compatibility with openai projects like aider.
"""
global llm_model_list, general_settings
all_models = []
## CHECK IF MODEL RESTRICTIONS ARE SET AT KEY/TEAM LEVEL ##
@@ -2791,7 +2798,7 @@ async def chat_completion(
## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
## IMPORTANT Note: - initialize this before running pre-call checks. Ensures we log rejected requests to langfuse.
data["litellm_call_id"] = str(uuid.uuid4())
data["litellm_call_id"] = request.headers.get('x-litellm-call-id', str(uuid.uuid4()))
logging_obj, data = litellm.utils.function_setup(
original_function="acompletion",
rules_obj=litellm.utils.Rules(),
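With this change the proxy reuses a caller-supplied x-litellm-call-id instead of always generating one, so logs and callbacks for the request carry an id the client already knows. A hedged sketch with the OpenAI SDK's extra_headers; the base URL, key, and model mirror values used elsewhere in this diff:

```python
# Sketch: supplying the x-litellm-call-id correlation header read by the
# chat_completion hunk above. Base URL, key, and model mirror values used
# elsewhere in this diff.
import uuid
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-1234")

call_id = str(uuid.uuid4())
resp = client.chat.completions.create(
    model="fake-openai-endpoint",
    messages=[{"role": "user", "content": "hi"}],
    extra_headers={"x-litellm-call-id": call_id},
)
print(call_id, resp.id)
```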
@@ -3243,6 +3250,12 @@ async def completion(
response_class=ORJSONResponse,
tags=["embeddings"],
)
@router.post(
"/engines/{model:path}/embeddings",
dependencies=[Depends(user_api_key_auth)],
response_class=ORJSONResponse,
tags=["embeddings"],
) # azure compatible endpoint
@router.post(
"/openai/deployments/{model:path}/embeddings",
dependencies=[Depends(user_api_key_auth)],
@@ -4891,117 +4904,6 @@ async def retrieve_batch(
######################################################################
######################################################################
# /v1/files Endpoints
######################################################################
@router.post(
"/v1/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.post(
"/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def create_file(
request: Request,
fastapi_response: Response,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Upload a file that can be used across - Assistants API, Batch API
This is the equivalent of POST https://api.openai.com/v1/files
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/create
Example Curl
```
curl https://api.openai.com/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
"""
global proxy_logging_obj
data: Dict = {}
try:
# Use orjson to parse JSON data, orjson speeds up requests significantly
form_data = await request.form()
data = {key: value for key, value in form_data.items() if key != "file"}
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
_create_file_request = CreateFileRequest()
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.acreate_file(
custom_llm_provider="openai", **_create_file_request
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.create_file(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
@router.post(
"/v1/moderations",
@@ -5150,6 +5052,198 @@ async def moderations(
)
#### ANTHROPIC ENDPOINTS ####
@router.post(
"/v1/messages",
tags=["[beta] Anthropic `/v1/messages`"],
dependencies=[Depends(user_api_key_auth)],
response_model=AnthropicResponse,
)
async def anthropic_response(
anthropic_data: AnthropicMessagesRequest,
fastapi_response: Response,
request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
from litellm import adapter_completion
from litellm.adapters.anthropic_adapter import anthropic_adapter
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
global user_temperature, user_request_timeout, user_max_tokens, user_api_base
data: dict = {**anthropic_data, "adapter_id": "anthropic"}
try:
data["model"] = (
general_settings.get("completion_model", None) # server default
or user_model # model name passed via cli args
or data["model"] # default passed in http request
)
if user_model:
data["model"] = user_model
data = await add_litellm_data_to_request(
data=data, # type: ignore
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# override with user settings, these are params passed via cli
if user_temperature:
data["temperature"] = user_temperature
if user_request_timeout:
data["request_timeout"] = user_request_timeout
if user_max_tokens:
data["max_tokens"] = user_max_tokens
if user_api_base:
data["api_base"] = user_api_base
### MODEL ALIAS MAPPING ###
# check if model name in model alias map
# get the actual model name
if data["model"] in litellm.model_alias_map:
data["model"] = litellm.model_alias_map[data["model"]]
### CALL HOOKS ### - modify incoming data before calling the model
data = await proxy_logging_obj.pre_call_hook( # type: ignore
user_api_key_dict=user_api_key_dict, data=data, call_type="text_completion"
)
### ROUTE THE REQUESTs ###
router_model_names = llm_router.model_names if llm_router is not None else []
# skip router if user passed their key
if "api_key" in data:
llm_response = asyncio.create_task(litellm.aadapter_completion(**data))
elif (
llm_router is not None and data["model"] in router_model_names
): # model in router model list
llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
elif (
llm_router is not None
and llm_router.model_group_alias is not None
and data["model"] in llm_router.model_group_alias
): # model set in model_group_alias
llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
elif (
llm_router is not None and data["model"] in llm_router.deployment_names
): # model in router deployments, calling a specific deployment on the router
llm_response = asyncio.create_task(
llm_router.aadapter_completion(**data, specific_deployment=True)
)
elif (
llm_router is not None and data["model"] in llm_router.get_model_ids()
): # model in router model list
llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
elif (
llm_router is not None
and data["model"] not in router_model_names
and llm_router.default_deployment is not None
): # model in router deployments, calling a specific deployment on the router
llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
elif user_model is not None: # `litellm --model <your-model-name>`
llm_response = asyncio.create_task(litellm.aadapter_completion(**data))
else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={
"error": "completion: Invalid model name passed in model="
+ data.get("model", "")
},
)
# Await the llm_response task
response = await llm_response
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
response_cost = hidden_params.get("response_cost", None) or ""
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
verbose_proxy_logger.debug("final response: %s", response)
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
)
)
verbose_proxy_logger.info("\nResponse from Litellm:\n{}".format(response))
return response
except RejectedRequestError as e:
_data = e.request_data
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict,
original_exception=e,
request_data=_data,
)
if _data.get("stream", None) is not None and _data["stream"] == True:
_chat_response = litellm.ModelResponse()
_usage = litellm.Usage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
_chat_response.usage = _usage # type: ignore
_chat_response.choices[0].message.content = e.message # type: ignore
_iterator = litellm.utils.ModelResponseIterator(
model_response=_chat_response, convert_to_delta=True
)
_streaming_response = litellm.TextCompletionStreamWrapper(
completion_stream=_iterator,
model=_data.get("model", ""),
)
selected_data_generator = select_data_generator(
response=_streaming_response,
user_api_key_dict=user_api_key_dict,
request_data=data,
)
return StreamingResponse(
selected_data_generator,
media_type="text/event-stream",
headers={},
)
else:
_response = litellm.TextCompletionResponse()
_response.choices[0].text = e.message
return _response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.completion(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
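For context, a minimal sketch of calling the beta /v1/messages route above. The payload follows the AnthropicMessagesRequest shape added later in this diff; the base URL, key, and model name are placeholders for a local proxy deployment.
```
# Hypothetical usage sketch: send an Anthropic-style request to the proxy.
# URL, key, and model name are placeholders, not values defined by this diff.
import httpx

resp = httpx.post(
    "http://localhost:4000/v1/messages",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "claude-3-5-sonnet-20240620",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    },
    timeout=60,
)
print(resp.json())
```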
#### DEV UTILS ####
# @router.get(
@ -9302,3 +9396,4 @@ app.include_router(caching_router)
app.include_router(analytics_router)
app.include_router(debugging_endpoints_router)
app.include_router(ui_crud_endpoints_router)
app.include_router(openai_files_router)

View file

@ -1765,6 +1765,125 @@ class Router:
self.fail_calls[model] += 1
raise e
async def aadapter_completion(
self,
adapter_id: str,
model: str,
is_retry: Optional[bool] = False,
is_fallback: Optional[bool] = False,
is_async: Optional[bool] = False,
**kwargs,
):
try:
kwargs["model"] = model
kwargs["adapter_id"] = adapter_id
kwargs["original_function"] = self._aadapter_completion
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
timeout = kwargs.get("request_timeout", self.timeout)
kwargs.setdefault("metadata", {}).update({"model_group": model})
response = await self.async_function_with_fallbacks(**kwargs)
return response
except Exception as e:
asyncio.create_task(
send_llm_exception_alert(
litellm_router_instance=self,
request_kwargs=kwargs,
error_traceback_str=traceback.format_exc(),
original_exception=e,
)
)
raise e
async def _aadapter_completion(self, adapter_id: str, model: str, **kwargs):
try:
verbose_router_logger.debug(
f"Inside _aadapter_completion()- model: {model}; kwargs: {kwargs}"
)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "default text"}],
specific_deployment=kwargs.pop("specific_deployment", None),
)
kwargs.setdefault("metadata", {}).update(
{
"deployment": deployment["litellm_params"]["model"],
"model_info": deployment.get("model_info", {}),
"api_base": deployment.get("litellm_params", {}).get("api_base"),
}
)
kwargs["model_info"] = deployment.get("model_info", {})
data = deployment["litellm_params"].copy()
model_name = data["model"]
for k, v in self.default_litellm_params.items():
if (
k not in kwargs
): # prioritize model-specific params > default router params
kwargs[k] = v
elif k == "metadata":
kwargs[k].update(v)
potential_model_client = self._get_client(
deployment=deployment, kwargs=kwargs, client_type="async"
)
# check if provided keys == client keys #
dynamic_api_key = kwargs.get("api_key", None)
if (
dynamic_api_key is not None
and potential_model_client is not None
and dynamic_api_key != potential_model_client.api_key
):
model_client = None
else:
model_client = potential_model_client
self.total_calls[model_name] += 1
response = litellm.aadapter_completion(
**{
**data,
"adapter_id": adapter_id,
"caching": self.cache_responses,
"client": model_client,
"timeout": self.timeout,
**kwargs,
}
)
rpm_semaphore = self._get_client(
deployment=deployment,
kwargs=kwargs,
client_type="max_parallel_requests",
)
if rpm_semaphore is not None and isinstance(
rpm_semaphore, asyncio.Semaphore
):
async with rpm_semaphore:
"""
- Check rpm limits before making the call
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
)
response = await response # type: ignore
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
response = await response # type: ignore
self.success_calls[model_name] += 1
verbose_router_logger.info(
f"litellm.aadapter_completion(model={model_name})\033[32m 200 OK\033[0m"
)
return response
except Exception as e:
verbose_router_logger.info(
f"litellm.aadapter_completion(model={model})\033[31m Exception {str(e)}\033[0m"
)
if model is not None:
self.fail_calls[model] += 1
raise e
def embedding(
self,
model: str,

File diff suppressed because one or more lines are too long

View file

@ -237,6 +237,8 @@ async def test_langfuse_logging_without_request_response(stream, langfuse_client
assert _trace_data[0].output == {
"role": "assistant",
"content": "redacted-by-litellm",
"function_call": None,
"tool_calls": None,
}
except Exception as e:
@ -273,7 +275,12 @@ async def test_langfuse_masked_input_output(langfuse_client):
expected_output = (
"redacted-by-litellm"
if mask_value
else {"content": "This is a test response", "role": "assistant"}
else {
"content": "This is a test response",
"role": "assistant",
"function_call": None,
"tool_calls": None,
}
)
langfuse_client.flush()
await asyncio.sleep(2)

View file

@ -0,0 +1,103 @@
# What is this?
## Unit tests for Anthropic Adapter
import asyncio
import os
import sys
import traceback
from dotenv import load_dotenv
load_dotenv()
import io
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from unittest.mock import MagicMock, patch
import pytest
import litellm
from litellm import AnthropicConfig, Router, adapter_completion
from litellm.adapters.anthropic_adapter import anthropic_adapter
from litellm.types.llms.anthropic import AnthropicResponse
def test_anthropic_completion_messages_translation():
messages = [{"role": "user", "content": "Hey, how's it going?"}]
translated_messages = AnthropicConfig().translate_anthropic_messages_to_openai(messages=messages) # type: ignore
assert translated_messages == [{"role": "user", "content": "Hey, how's it going?"}]
def test_anthropic_completion_input_translation():
data = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
}
translated_input = anthropic_adapter.translate_completion_input_params(kwargs=data)
assert translated_input is not None
assert translated_input["model"] == "gpt-3.5-turbo"
assert translated_input["messages"] == [
{"role": "user", "content": "Hey, how's it going?"}
]
def test_anthropic_completion_e2e():
litellm.set_verbose = True
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
messages = [{"role": "user", "content": "Hey, how's it going?"}]
response = adapter_completion(
model="gpt-3.5-turbo",
messages=messages,
adapter_id="anthropic",
mock_response="This is a fake call",
)
print("Response: {}".format(response))
assert response is not None
assert isinstance(response, AnthropicResponse)
@pytest.mark.asyncio
async def test_anthropic_router_completion_e2e():
litellm.set_verbose = True
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
router = Router(
model_list=[
{
"model_name": "claude-3-5-sonnet-20240620",
"litellm_params": {
"model": "gpt-3.5-turbo",
"mock_response": "hi this is macintosh.",
},
}
]
)
messages = [{"role": "user", "content": "Hey, how's it going?"}]
response = await router.aadapter_completion(
model="claude-3-5-sonnet-20240620",
messages=messages,
adapter_id="anthropic",
mock_response="This is a fake call",
)
print("Response: {}".format(response))
assert response is not None
assert isinstance(response, AnthropicResponse)
assert response.model == "gpt-3.5-turbo"

View file

@ -1,21 +1,20 @@
import asyncio
import litellm
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from litellm._logging import verbose_logger
import logging
import time
import pytest
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
verbose_logger.setLevel(logging.DEBUG)
@pytest.mark.skip(
reason="new test. WIP. works locally but not on CI. Still figuring this out"
)
@pytest.mark.asyncio
async def test_otel_callback():
@pytest.mark.skip(reason="Local only test. WIP.")
async def test_async_otel_callback():
exporter = InMemorySpanExporter()
litellm.set_verbose = True
litellm.callbacks = [OpenTelemetry(OpenTelemetryConfig(exporter=exporter))]

View file

@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
# litellm.num_retries = 3
# litellm.num_retries=3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"
@ -3065,32 +3065,38 @@ def response_format_tests(response: litellm.ModelResponse):
@pytest.mark.asyncio
async def test_completion_bedrock_httpx_models(sync_mode, model):
litellm.set_verbose = True
try:
if sync_mode:
response = completion(
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=200,
)
if sync_mode:
response = completion(
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=200,
)
assert isinstance(response, litellm.ModelResponse)
assert isinstance(response, litellm.ModelResponse)
response_format_tests(response=response)
else:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=100,
)
response_format_tests(response=response)
else:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=100,
)
assert isinstance(response, litellm.ModelResponse)
assert isinstance(response, litellm.ModelResponse)
print(f"response: {response}")
response_format_tests(response=response)
print(f"response: {response}")
response_format_tests(response=response)
print(f"response: {response}")
except litellm.RateLimitError as e:
print("got rate limit error=", e)
pass
except Exception as e:
pytest.fail(f"An error occurred - {str(e)}")
def test_completion_bedrock_titan_null_response():

View file

@ -712,6 +712,79 @@ def test_vertex_ai_claude_completion_cost():
assert cost == predicted_cost
def test_vertex_ai_embedding_completion_cost(caplog):
"""
Relevant issue - https://github.com/BerriAI/litellm/issues/4630
"""
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
text = "The quick brown fox jumps over the lazy dog."
input_tokens = litellm.token_counter(
model="vertex_ai/textembedding-gecko", text=text
)
model_info = litellm.get_model_info(model="vertex_ai/textembedding-gecko")
print("\nExpected model info:\n{}\n\n".format(model_info))
expected_input_cost = input_tokens * model_info["input_cost_per_token"]
## CALCULATED COST
calculated_input_cost, calculated_output_cost = cost_per_token(
model="textembedding-gecko",
custom_llm_provider="vertex_ai",
prompt_tokens=input_tokens,
call_type="aembedding",
)
assert round(expected_input_cost, 6) == round(calculated_input_cost, 6)
print("expected_input_cost: {}".format(expected_input_cost))
print("calculated_input_cost: {}".format(calculated_input_cost))
captured_logs = [rec.message for rec in caplog.records]
for item in captured_logs:
print("\nitem:{}\n".format(item))
if (
"litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured "
in item
):
raise Exception("Error log raised for calculating embedding cost")
# def test_vertex_ai_embedding_completion_cost_e2e():
# """
# Relevant issue - https://github.com/BerriAI/litellm/issues/4630
# """
# from litellm.tests.test_amazing_vertex_completion import load_vertex_ai_credentials
# load_vertex_ai_credentials()
# os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
# litellm.model_cost = litellm.get_model_cost_map(url="")
# text = "The quick brown fox jumps over the lazy dog."
# input_tokens = litellm.token_counter(
# model="vertex_ai/textembedding-gecko", text=text
# )
# model_info = litellm.get_model_info(model="vertex_ai/textembedding-gecko")
# print("\nExpected model info:\n{}\n\n".format(model_info))
# expected_input_cost = input_tokens * model_info["input_cost_per_token"]
# ## CALCULATED COST
# resp = litellm.embedding(model="textembedding-gecko", input=[text])
# calculated_input_cost = resp._hidden_params["response_cost"]
# assert round(expected_input_cost, 6) == round(calculated_input_cost, 6)
# print("expected_input_cost: {}".format(expected_input_cost))
# print("calculated_input_cost: {}".format(calculated_input_cost))
# assert False
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_completion_cost_hidden_params(sync_mode):

View file

@ -1,13 +1,16 @@
# What is this?
## Unit testing for the 'get_model_info()' function
import os, sys, traceback
import os
import sys
import traceback
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
from litellm import get_model_info
import pytest
def test_get_model_info_simple_model_name():
@ -37,3 +40,9 @@ def test_get_model_info_custom_llm_with_same_name_vllm():
pytest.fail("Expected get model info to fail for an unmapped model/provider")
except Exception:
pass
def test_get_model_info_shows_correct_supports_vision():
info = litellm.get_model_info("gemini/gemini-1.5-flash")
print("info", info)
assert info["supports_vision"] is True

View file

@ -1,22 +1,26 @@
# What is this?
## Unit Tests for OpenAI Batches API
import sys, os, json
import traceback
import asyncio
import json
import os
import sys
import traceback
from dotenv import load_dotenv
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest, logging, asyncio
import litellm
from litellm import (
create_batch,
create_file,
)
import asyncio
import logging
import time
import pytest
import litellm
from litellm import create_batch, create_file
def test_create_batch():
"""
@ -144,6 +148,28 @@ async def test_async_create_batch():
print("file content = ", file_content)
# file obj
file_obj = await litellm.afile_retrieve(
file_id=batch_input_file_id, custom_llm_provider="openai"
)
print("file obj = ", file_obj)
assert file_obj.id == batch_input_file_id
# delete file
delete_file_response = await litellm.afile_delete(
file_id=batch_input_file_id, custom_llm_provider="openai"
)
print("delete file response = ", delete_file_response)
assert delete_file_response.id == batch_input_file_id
all_files_list = await litellm.afile_list(
custom_llm_provider="openai",
)
print("all_files_list = ", all_files_list)
# # write this file content to a file
# with open("file_content.json", "w") as f:
# json.dump(file_content, f)

View file

@ -20,7 +20,7 @@ import pytest
import litellm
from litellm.proxy._types import LiteLLMRoutes
from litellm.proxy.auth.auth_utils import is_openai_route
from litellm.proxy.proxy_server import router
from litellm.proxy.proxy_server import app
# Configure logging
logging.basicConfig(
@ -37,7 +37,7 @@ def test_routes_on_litellm_proxy():
this prevents accidentally deleting /threads, /batches, etc.
"""
_all_routes = []
for route in router.routes:
for route in app.routes:
_path_as_str = str(route.path)
if ":path" in _path_as_str:

View file

@ -21,6 +21,8 @@ sys.path.insert(
from dotenv import load_dotenv
load_dotenv()
import random
import litellm
from litellm import (
AuthenticationError,
@ -1373,7 +1375,8 @@ async def test_bedrock_httpx_streaming(sync_mode, model):
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}\n\nFinalChunk: {final_chunk}")
except RateLimitError:
except RateLimitError as e:
print("got rate limit error=", e)
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -3037,8 +3040,11 @@ def test_completion_claude_3_function_call_with_streaming():
@pytest.mark.parametrize(
"model", ["gemini/gemini-1.5-flash"]
) # "claude-3-opus-20240229",
"model",
[
"gemini/gemini-1.5-flash",
], # "claude-3-opus-20240229"
) #
@pytest.mark.asyncio
async def test_acompletion_claude_3_function_call_with_streaming(model):
litellm.set_verbose = True
@ -3046,41 +3052,45 @@ async def test_acompletion_claude_3_function_call_with_streaming(model):
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"name": "generate_series_of_questions",
"description": "Generate a series of questions, given a topic.",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
"questions": {
"type": "array",
"description": "The questions to be generated.",
"items": {"type": "string"},
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
"required": ["questions"],
},
},
}
},
]
SYSTEM_PROMPT = "You are an AI assistant"
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{
"role": "user",
"content": "What's the weather like in Boston today in fahrenheit?",
}
"content": "Generate 3 questions about civil engineering.",
},
]
try:
# test without max tokens
response = await acompletion(
model=model,
# model="claude-3-5-sonnet-20240620",
messages=messages,
tools=tools,
tool_choice="required",
stream=True,
temperature=0.75,
tools=tools,
stream_options={"include_usage": True},
)
idx = 0
print(f"response: {response}")
async for chunk in response:
# print(f"chunk: {chunk}")
print(f"chunk in test: {chunk}")
if idx == 0:
assert (
chunk.choices[0].delta.tool_calls[0].function.arguments is not None
@ -3510,3 +3520,56 @@ def test_unit_test_custom_stream_wrapper_function_call():
if chunk.choices[0].finish_reason is not None:
finish_reason = chunk.choices[0].finish_reason
assert finish_reason == "tool_calls"
## UNIT TEST RECREATING MODEL RESPONSE
from litellm.types.utils import (
ChatCompletionDeltaToolCall,
Delta,
Function,
StreamingChoices,
Usage,
)
initial_model_response = litellm.ModelResponse(
id="chatcmpl-842826b6-75a1-4ed4-8a68-7655e60654b3",
choices=[
StreamingChoices(
finish_reason=None,
index=0,
delta=Delta(
content="",
role="assistant",
function_call=None,
tool_calls=[
ChatCompletionDeltaToolCall(
id="7ee88721-bfee-4584-8662-944a23d4c7a5",
function=Function(
arguments='{"questions": ["What are the main challenges facing civil engineers today?", "How has technology impacted the field of civil engineering?", "What are some of the most innovative projects in civil engineering in recent years?"]}',
name="generate_series_of_questions",
),
type="function",
index=0,
)
],
),
logprobs=None,
)
],
created=1720755257,
model="gemini-1.5-flash",
object="chat.completion.chunk",
system_fingerprint=None,
usage=Usage(prompt_tokens=67, completion_tokens=55, total_tokens=122),
stream=True,
)
obj_dict = initial_model_response.dict()
if "usage" in obj_dict:
del obj_dict["usage"]
new_model = response.model_response_creator(chunk=obj_dict)
print("\n\n{}\n\n".format(new_model))
assert len(new_model.choices[0].delta.tool_calls) > 0

View file

@ -258,6 +258,13 @@ def test_validate_environment_empty_model():
raise Exception()
def test_validate_environment_api_key():
response_obj = validate_environment(model="gpt-3.5-turbo", api_key="sk-my-test-key")
assert (
response_obj["keys_in_environment"] is True
), f"Missing keys={response_obj['missing_keys']}"
@mock.patch.dict(os.environ, {"OLLAMA_API_BASE": "foo"}, clear=True)
def test_validate_environment_ollama():
for provider in ["ollama", "ollama_chat"]:

litellm/types/adapter.py
View file

@ -0,0 +1,10 @@
from typing import List
from typing_extensions import Dict, Required, TypedDict, override
from litellm.integrations.custom_logger import CustomLogger
class AdapterItem(TypedDict):
id: str
adapter: CustomLogger
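A minimal sketch of how an AdapterItem entry is used, mirroring the registration done in the proxy handler and the adapter tests elsewhere in this diff; the import path for AdapterItem is taken from the new litellm/types/adapter.py file.
```
# Sketch: register the anthropic adapter as an AdapterItem entry.
import litellm
from litellm.adapters.anthropic_adapter import anthropic_adapter
from litellm.types.adapter import AdapterItem

adapter_entry: AdapterItem = {"id": "anthropic", "adapter": anthropic_adapter}
litellm.adapters = [adapter_entry]
```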

View file

@ -9,25 +9,27 @@ class AnthropicMessagesToolChoice(TypedDict, total=False):
name: str
class AnthopicMessagesAssistantMessageTextContentParam(TypedDict, total=False):
type: Required[Literal["text"]]
class AnthropicMessagesTool(TypedDict, total=False):
name: Required[str]
description: str
input_schema: Required[dict]
class AnthropicMessagesTextParam(TypedDict):
type: Literal["text"]
text: str
class AnthopicMessagesAssistantMessageToolCallParam(TypedDict, total=False):
type: Required[Literal["tool_use"]]
class AnthropicMessagesToolUseParam(TypedDict):
type: Literal["tool_use"]
id: str
name: str
input: dict
AnthropicMessagesAssistantMessageValues = Union[
AnthopicMessagesAssistantMessageTextContentParam,
AnthopicMessagesAssistantMessageToolCallParam,
AnthropicMessagesTextParam,
AnthropicMessagesToolUseParam,
]
@ -46,6 +48,72 @@ class AnthopicMessagesAssistantMessageParam(TypedDict, total=False):
"""
class AnthropicImageParamSource(TypedDict):
type: Literal["base64"]
media_type: str
data: str
class AnthropicMessagesImageParam(TypedDict):
type: Literal["image"]
source: AnthropicImageParamSource
class AnthropicMessagesToolResultContent(TypedDict):
type: Literal["text"]
text: str
class AnthropicMessagesToolResultParam(TypedDict, total=False):
type: Required[Literal["tool_result"]]
tool_use_id: Required[str]
is_error: bool
content: Union[
str,
Iterable[
Union[AnthropicMessagesToolResultContent, AnthropicMessagesImageParam]
],
]
AnthropicMessagesUserMessageValues = Union[
AnthropicMessagesTextParam,
AnthropicMessagesImageParam,
AnthropicMessagesToolResultParam,
]
class AnthropicMessagesUserMessageParam(TypedDict, total=False):
role: Required[Literal["user"]]
content: Required[Union[str, Iterable[AnthropicMessagesUserMessageValues]]]
class AnthropicMetadata(TypedDict, total=False):
user_id: str
class AnthropicMessagesRequest(TypedDict, total=False):
model: Required[str]
messages: Required[
List[
Union[
AnthropicMessagesUserMessageParam,
AnthopicMessagesAssistantMessageParam,
]
]
]
max_tokens: Required[int]
metadata: AnthropicMetadata
stop_sequences: List[str]
stream: bool
system: str
temperature: float
tool_choice: AnthropicMessagesToolChoice
tools: List[AnthropicMessagesTool]
top_k: int
top_p: float
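For reference, a minimal request dict that satisfies the AnthropicMessagesRequest shape above; only model, messages, and max_tokens are Required, the remaining keys are optional and shown here purely for illustration.
```
# Sketch: a valid AnthropicMessagesRequest payload.
from litellm.types.llms.anthropic import AnthropicMessagesRequest

request: AnthropicMessagesRequest = {
    "model": "claude-3-5-sonnet-20240620",
    "max_tokens": 256,
    "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    "temperature": 0.7,
    "stream": False,
}
```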
class ContentTextBlockDelta(TypedDict):
"""
'delta': {'type': 'text_delta', 'text': 'Hello'}
@ -155,3 +223,51 @@ class MessageStartBlock(TypedDict):
type: Literal["message_start"]
message: MessageChunk
class AnthropicResponseContentBlockText(BaseModel):
type: Literal["text"]
text: str
class AnthropicResponseContentBlockToolUse(BaseModel):
type: Literal["tool_use"]
id: str
name: str
input: dict
class AnthropicResponseUsageBlock(BaseModel):
input_tokens: int
output_tokens: int
AnthropicFinishReason = Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]
class AnthropicResponse(BaseModel):
id: str
"""Unique object identifier."""
type: Literal["message"]
"""For Messages, this is always "message"."""
role: Literal["assistant"]
"""Conversational role of the generated message. This will always be "assistant"."""
content: List[
Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
]
"""Content generated by the model."""
model: str
"""The model that handled the request."""
stop_reason: Optional[AnthropicFinishReason]
"""The reason that we stopped."""
stop_sequence: Optional[str]
"""Which custom stop sequence was generated, if any."""
usage: AnthropicResponseUsageBlock
"""Billing and rate-limit usage."""

View file

@ -305,7 +305,13 @@ class ChatCompletionToolCallFunctionChunk(TypedDict, total=False):
arguments: str
class ChatCompletionToolCallChunk(TypedDict):
class ChatCompletionAssistantToolCall(TypedDict):
id: Optional[str]
type: Literal["function"]
function: ChatCompletionToolCallFunctionChunk
class ChatCompletionToolCallChunk(TypedDict): # result of /chat/completions call
id: Optional[str]
type: Literal["function"]
function: ChatCompletionToolCallFunctionChunk
@ -319,6 +325,107 @@ class ChatCompletionDeltaToolCallChunk(TypedDict, total=False):
index: int
class ChatCompletionTextObject(TypedDict):
type: Literal["text"]
text: str
class ChatCompletionImageUrlObject(TypedDict, total=False):
url: Required[str]
detail: str
class ChatCompletionImageObject(TypedDict):
type: Literal["image_url"]
image_url: ChatCompletionImageUrlObject
class ChatCompletionUserMessage(TypedDict):
role: Literal["user"]
content: Union[
str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]]
]
class ChatCompletionAssistantMessage(TypedDict, total=False):
role: Required[Literal["assistant"]]
content: Optional[str]
name: str
tool_calls: List[ChatCompletionAssistantToolCall]
class ChatCompletionToolMessage(TypedDict):
role: Literal["tool"]
content: str
tool_call_id: str
class ChatCompletionSystemMessage(TypedDict, total=False):
role: Required[Literal["system"]]
content: Required[str]
name: str
AllMessageValues = Union[
ChatCompletionUserMessage,
ChatCompletionAssistantMessage,
ChatCompletionToolMessage,
ChatCompletionSystemMessage,
]
class ChatCompletionToolChoiceFunctionParam(TypedDict):
name: str
class ChatCompletionToolChoiceObjectParam(TypedDict):
type: Literal["function"]
function: ChatCompletionToolChoiceFunctionParam
ChatCompletionToolChoiceStringValues = Literal["none", "auto", "required"]
ChatCompletionToolChoiceValues = Union[
ChatCompletionToolChoiceStringValues, ChatCompletionToolChoiceObjectParam
]
class ChatCompletionToolParamFunctionChunk(TypedDict, total=False):
name: Required[str]
description: str
parameters: dict
class ChatCompletionToolParam(TypedDict):
type: Literal["function"]
function: ChatCompletionToolParamFunctionChunk
class ChatCompletionRequest(TypedDict, total=False):
model: Required[str]
messages: Required[List[AllMessageValues]]
frequency_penalty: float
logit_bias: dict
logprobs: bool
top_logprobs: int
max_tokens: int
n: int
presence_penalty: float
response_format: dict
seed: int
service_tier: str
stop: Union[str, List[str]]
stream_options: dict
temperature: float
top_p: float
tools: List[ChatCompletionToolParam]
tool_choice: ChatCompletionToolChoiceValues
parallel_tool_calls: bool
function_call: Union[str, dict]
functions: List
user: str
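To make the request shape concrete, a minimal sketch of a ChatCompletionRequest built from the message and tool types above; the import path is assumed to be the module this hunk belongs to.
```
# Sketch: a ChatCompletionRequest with system + user messages and one tool.
# Import path assumed; the types are the TypedDicts defined above.
from litellm.types.llms.openai import ChatCompletionRequest

request: ChatCompletionRequest = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "system", "content": "You are an AI assistant"},
        {"role": "user", "content": "Generate 3 questions about civil engineering."},
    ],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "generate_series_of_questions",
                "description": "Generate a series of questions, given a topic.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "questions": {"type": "array", "items": {"type": "string"}}
                    },
                    "required": ["questions"],
                },
            },
        }
    ],
    "tool_choice": "auto",
    "max_tokens": 256,
}
```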
class ChatCompletionDeltaChunk(TypedDict, total=False):
content: Optional[str]
tool_calls: List[ChatCompletionDeltaToolCallChunk]

View file

@ -73,6 +73,7 @@ class ModelInfo(TypedDict, total=False):
supported_openai_params: Required[Optional[List[str]]]
supports_system_messages: Optional[bool]
supports_response_schema: Optional[bool]
supports_vision: Optional[bool]
class GenericStreamingChunk(TypedDict):
@ -166,7 +167,9 @@ class FunctionCall(OpenAIObject):
class Function(OpenAIObject):
arguments: str
name: Optional[str] = None
name: Optional[
str
] # can be None - openai e.g.: ChoiceDeltaToolCallFunction(arguments='{"', name=None), type=None)
def __init__(
self,
@ -280,29 +283,43 @@ class ChatCompletionMessageToolCall(OpenAIObject):
setattr(self, key, value)
"""
Reference:
ChatCompletionMessage(content='This is a test', role='assistant', function_call=None, tool_calls=None))
"""
class Message(OpenAIObject):
content: Optional[str]
role: Literal["assistant"]
tool_calls: Optional[List[ChatCompletionMessageToolCall]]
function_call: Optional[FunctionCall]
def __init__(
self,
content: Optional[str] = "default",
role="assistant",
logprobs=None,
content: Optional[str] = None,
role: Literal["assistant"] = "assistant",
function_call=None,
tool_calls=None,
**params,
):
super(Message, self).__init__(**params)
self.content = content
self.role = role
if function_call is not None:
self.function_call = FunctionCall(**function_call)
if tool_calls is not None:
self.tool_calls = []
for tool_call in tool_calls:
self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))
if logprobs is not None:
self._logprobs = ChoiceLogprobs(**logprobs)
init_values = {
"content": content,
"role": "assistant",
"function_call": (
FunctionCall(**function_call) if function_call is not None else None
),
"tool_calls": (
[ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls]
if tool_calls is not None
else None
),
}
super(Message, self).__init__(
**init_values,
**params,
)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
@ -556,6 +573,8 @@ class ModelResponse(OpenAIObject):
_new_choice = choice # type: ignore
elif isinstance(choice, dict):
_new_choice = Choices(**choice) # type: ignore
else:
_new_choice = choice
new_choices.append(_new_choice)
choices = new_choices
else:
@ -608,10 +627,6 @@ class ModelResponse(OpenAIObject):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
def json(self, **kwargs):
try:
return self.model_dump() # noqa

View file

@ -4829,6 +4829,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
supports_response_schema=_model_info.get(
"supports_response_schema", None
),
supports_vision=_model_info.get("supports_vision", False),
)
except Exception:
raise Exception(
@ -5048,12 +5049,15 @@ def create_proxy_transport_and_mounts():
return sync_proxy_mounts, async_proxy_mounts
def validate_environment(model: Optional[str] = None) -> dict:
def validate_environment(
model: Optional[str] = None, api_key: Optional[str] = None
) -> dict:
"""
Checks if the environment variables are valid for the given model.
Args:
model (Optional[str]): The name of the model. Defaults to None.
api_key (Optional[str]): An API key supplied by the caller, if any.
Returns:
dict: A dictionary containing the following keys:
@ -5329,6 +5333,13 @@ def validate_environment(model: Optional[str] = None) -> dict:
keys_in_environment = True
else:
missing_keys.append("NLP_CLOUD_API_KEY")
if api_key is not None:
new_missing_keys = []
for key in missing_keys:
if "api_key" not in key.lower():
new_missing_keys.append(key)
missing_keys = new_missing_keys
return {"keys_in_environment": keys_in_environment, "missing_keys": missing_keys}
@ -8126,7 +8137,7 @@ class CustomStreamWrapper:
if chunk.startswith(self.complete_response):
# Remove last_sent_chunk only if it appears at the start of the new chunk
chunk = chunk[len(self.complete_response):]
chunk = chunk[len(self.complete_response) :]
self.complete_response += chunk
return chunk
@ -8940,7 +8951,16 @@ class CustomStreamWrapper:
model_response.system_fingerprint = self.system_fingerprint
model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider
model_response._hidden_params["created_at"] = time.time()
model_response.choices = [StreamingChoices(finish_reason=None)]
if (
len(model_response.choices) > 0
and hasattr(model_response.choices[0], "delta")
and model_response.choices[0].delta is not None
):
# do nothing, if object instantiated
pass
else:
model_response.choices = [StreamingChoices(finish_reason=None)]
return model_response
def is_delta_empty(self, delta: Delta) -> bool:
@ -9483,8 +9503,8 @@ class CustomStreamWrapper:
model_response.choices[0].delta = Delta(**_json_delta)
except Exception as e:
verbose_logger.error(
"litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}".format(
str(e)
"litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}\n{}".format(
str(e), traceback.format_exc()
)
)
verbose_logger.debug(traceback.format_exc())
@ -9881,7 +9901,6 @@ class CustomStreamWrapper:
self.rules.post_call_rules(
input=self.response_uptil_now, model=self.model
)
print_verbose(f"final returned processed chunk: {processed_chunk}")
self.chunks.append(processed_chunk)
if hasattr(
processed_chunk, "usage"
@ -9895,6 +9914,7 @@ class CustomStreamWrapper:
# Create a new object without the removed attribute
processed_chunk = self.model_response_creator(chunk=obj_dict)
print_verbose(f"final returned processed chunk: {processed_chunk}")
return processed_chunk
raise StopAsyncIteration
else: # temporary patch for non-aiohttp async calls
@ -10124,7 +10144,7 @@ def mock_completion_streaming_obj(
model_response, mock_response, model, n: Optional[int] = None
):
for i in range(0, len(mock_response), 3):
completion_obj = Delta(role="assistant", content=mock_response[i: i + 3])
completion_obj = Delta(role="assistant", content=mock_response[i : i + 3])
if n is None:
model_response.choices[0].delta = completion_obj
else:
@ -10133,7 +10153,7 @@ def mock_completion_streaming_obj(
_streaming_choice = litellm.utils.StreamingChoices(
index=j,
delta=litellm.utils.Delta(
role="assistant", content=mock_response[i: i + 3]
role="assistant", content=mock_response[i : i + 3]
),
)
_all_choices.append(_streaming_choice)
@ -10145,7 +10165,7 @@ async def async_mock_completion_streaming_obj(
model_response, mock_response, model, n: Optional[int] = None
):
for i in range(0, len(mock_response), 3):
completion_obj = Delta(role="assistant", content=mock_response[i: i + 3])
completion_obj = Delta(role="assistant", content=mock_response[i : i + 3])
if n is None:
model_response.choices[0].delta = completion_obj
else:
@ -10154,7 +10174,7 @@ async def async_mock_completion_streaming_obj(
_streaming_choice = litellm.utils.StreamingChoices(
index=j,
delta=litellm.utils.Delta(
role="assistant", content=mock_response[i: i + 3]
role="assistant", content=mock_response[i : i + 3]
),
)
_all_choices.append(_streaming_choice)

poetry.lock
View file

@ -225,13 +225,13 @@ aio = ["aiohttp (>=3.0)"]
[[package]]
name = "azure-identity"
version = "1.16.0"
version = "1.16.1"
description = "Microsoft Azure Identity Library for Python"
optional = true
python-versions = ">=3.8"
files = [
{file = "azure-identity-1.16.0.tar.gz", hash = "sha256:6ff1d667cdcd81da1ceab42f80a0be63ca846629f518a922f7317a7e3c844e1b"},
{file = "azure_identity-1.16.0-py3-none-any.whl", hash = "sha256:722fdb60b8fdd55fa44dc378b8072f4b419b56a5e54c0de391f644949f3a826f"},
{file = "azure-identity-1.16.1.tar.gz", hash = "sha256:6d93f04468f240d59246d8afde3091494a5040d4f141cad0f49fc0c399d0d91e"},
{file = "azure_identity-1.16.1-py3-none-any.whl", hash = "sha256:8fb07c25642cd4ac422559a8b50d3e77f73dcc2bbfaba419d06d6c9d7cff6726"},
]
[package.dependencies]

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.41.15"
version = "1.41.18"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,10 +91,16 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.41.15"
version = "1.41.18"
version_files = [
"pyproject.toml:^version"
]
[tool.mypy]
plugins = "pydantic.mypy"
[tool.prisma]
# cache engine binaries in a directory relative to your project
# binary_cache_dir = '.binaries'
home_dir = '.prisma'
nodeenv_cache_dir = '.nodeenv'

Some files were not shown because too many files have changed in this diff.