Merge branch 'main' into litellm_support_lakera_config_thresholds

This commit is contained in:
Krish Dholakia 2024-08-06 22:47:13 -07:00 committed by GitHub
commit c82fc0cac2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
250 changed files with 17468 additions and 19307 deletions


@ -48,7 +48,7 @@ jobs:
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
pip install openai==1.34.0
pip install prisma
pip install prisma==0.11.0
pip install "detect_secrets==1.5.0"
pip install "httpx==0.24.1"
pip install fastapi
@ -208,6 +208,8 @@ jobs:
-e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \
-e MISTRAL_API_KEY=$MISTRAL_API_KEY \
-e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
-e GROQ_API_KEY=$GROQ_API_KEY \
-e COHERE_API_KEY=$COHERE_API_KEY \
-e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
-e AWS_REGION_NAME=$AWS_REGION_NAME \
-e AUTO_INFER_REGION=True \
@ -404,7 +406,7 @@ jobs:
circleci step halt
fi
- run:
name: Trigger Github Action for new Docker Container
name: Trigger Github Action for new Docker Container + Trigger Stable Release Testing
command: |
echo "Install TOML package."
python3 -m pip install toml
@ -415,7 +417,8 @@ jobs:
-H "Authorization: Bearer $GITHUB_TOKEN" \
"https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \
-d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}\", \"commit_hash\":\"$CIRCLE_SHA1\"}}"
echo "triggering stable release server for version ${VERSION} and commit ${CIRCLE_SHA1}"
curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}"
workflows:
version: 2
build_and_test:


@ -21,6 +21,14 @@ env:
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
jobs:
# print commit hash, tag, and release type
print:
runs-on: ubuntu-latest
steps:
- run: |
echo "Commit hash: ${{ github.event.inputs.commit_hash }}"
echo "Tag: ${{ github.event.inputs.tag }}"
echo "Release type: ${{ github.event.inputs.release_type }}"
docker-hub-deploy:
if: github.repository == 'BerriAI/litellm'
runs-on: ubuntu-latest

Dockerfile.custom_ui (new file, 41 lines added)

@ -0,0 +1,41 @@
# Use the provided base image
FROM ghcr.io/berriai/litellm:litellm_fwd_server_root_path-dev
# Set the working directory to /app
WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
# Set an environment variable for UI_BASE_PATH
# This can be overridden at build time
# set UI_BASE_PATH to "<your server root path>/ui"
ENV UI_BASE_PATH="/prod/ui"
# Build the UI with the specified UI_BASE_PATH
WORKDIR /app/ui/litellm-dashboard
RUN npm install
RUN UI_BASE_PATH=$UI_BASE_PATH npm run build
# Create the destination directory
RUN mkdir -p /app/litellm/proxy/_experimental/out
# Move the built files to the appropriate location
# Assuming the build output is in ./out directory
RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
mv ./out/* /app/litellm/proxy/_experimental/out/
# Switch back to the main app directory
WORKDIR /app
# Make sure your entrypoint.sh is executable
RUN chmod +x entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]


@ -11,7 +11,7 @@
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
<br>
</p>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">LiteLLM Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>
<h4 align="center">
<a href="https://pypi.org/project/litellm/" target="_blank">
<img src="https://img.shields.io/pypi/v/litellm.svg" alt="PyPI Version">
@ -35,7 +35,7 @@ LiteLLM manages:
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Set Budgets & Rate limits per project, api key, model [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
- Set Budgets & Rate limits per project, api key, model [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs)
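As a minimal sketch of the consistent-output point above (the model name and key are placeholders, not taken from the README):
```python
# Minimal sketch: one completion call through the LiteLLM SDK.
# Assumes a valid OPENAI_API_KEY; swap the model string for any supported provider.
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from LiteLLM"}],
)

# Regardless of provider, the text is always at the same path:
print(response["choices"][0]["message"]["content"])
```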
@ -166,6 +166,10 @@ $ litellm --model huggingface/bigcode/starcoder
### Step 2: Make ChatCompletions Request to Proxy
> [!IMPORTANT]
> 💡 [Use LiteLLM Proxy with Langchain (Python, JS), OpenAI SDK (Python, JS) Anthropic SDK, Mistral SDK, LlamaIndex, Instructor, Curl](https://docs.litellm.ai/docs/proxy/user_keys)
```python
import openai # openai v1.0.0+
client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000") # set proxy to base_url
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "this is a test request, write a short poem"}])
print(response)
```


@ -0,0 +1,565 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Migrating to LiteLLM Proxy from OpenAI/Azure OpenAI\n",
"\n",
"Covers:\n",
"\n",
"* /chat/completion\n",
"* /embedding\n",
"\n",
"\n",
"These are **selected examples**. LiteLLM Proxy is **OpenAI-Compatible**, it works with any project that calls OpenAI. Just change the `base_url`, `api_key` and `model`.\n",
"\n",
"For more examples, [go here](https://docs.litellm.ai/docs/proxy/user_keys)\n",
"\n",
"To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)\n",
"\n",
"To drop unsupported params (E.g. frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)\n"
],
"metadata": {
"id": "kccfk0mHZ4Ad"
}
},
{
"cell_type": "markdown",
"source": [
"## /chat/completion\n",
"\n"
],
"metadata": {
"id": "nmSClzCPaGH6"
}
},
{
"cell_type": "markdown",
"source": [
"### OpenAI Python SDK"
],
"metadata": {
"id": "_vqcjwOVaKpO"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "x1e_Ok3KZzeP"
},
"outputs": [],
"source": [
"import openai\n",
"client = openai.OpenAI(\n",
" api_key=\"anything\",\n",
" base_url=\"http://0.0.0.0:4000\"\n",
")\n",
"\n",
"# request sent to model set on litellm proxy, `litellm --model`\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"this is a test request, write a short poem\"\n",
" }\n",
" ],\n",
" extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params\n",
" \"metadata\": { # 👈 use for logging additional params (e.g. to langfuse)\n",
" \"generation_name\": \"ishaan-generation-openai-client\",\n",
" \"generation_id\": \"openai-client-gen-id22\",\n",
" \"trace_id\": \"openai-client-trace-id22\",\n",
" \"trace_user_id\": \"openai-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"source": [
"## Function Calling"
],
"metadata": {
"id": "AqkyKk9Scxgj"
}
},
{
"cell_type": "code",
"source": [
"from openai import OpenAI\n",
"client = OpenAI(\n",
" api_key=\"sk-1234\", # [OPTIONAL] set if you set one on proxy, else set \"\"\n",
" base_url=\"http://0.0.0.0:4000\",\n",
")\n",
"\n",
"tools = [\n",
" {\n",
" \"type\": \"function\",\n",
" \"function\": {\n",
" \"name\": \"get_current_weather\",\n",
" \"description\": \"Get the current weather in a given location\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"location\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
" },\n",
" \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
" },\n",
" \"required\": [\"location\"],\n",
" },\n",
" }\n",
" }\n",
"]\n",
"messages = [{\"role\": \"user\", \"content\": \"What's the weather like in Boston today?\"}]\n",
"completion = client.chat.completions.create(\n",
" model=\"gpt-4o\", # use 'model_name' from config.yaml\n",
" messages=messages,\n",
" tools=tools,\n",
" tool_choice=\"auto\"\n",
")\n",
"\n",
"print(completion)\n"
],
"metadata": {
"id": "wDg10VqLczE1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Azure OpenAI Python SDK"
],
"metadata": {
"id": "YYoxLloSaNWW"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"client = openai.AzureOpenAI(\n",
" api_key=\"anything\",\n",
" base_url=\"http://0.0.0.0:4000\"\n",
")\n",
"\n",
"# request sent to model set on litellm proxy, `litellm --model`\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"this is a test request, write a short poem\"\n",
" }\n",
" ],\n",
" extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params\n",
" \"metadata\": { # 👈 use for logging additional params (e.g. to langfuse)\n",
" \"generation_name\": \"ishaan-generation-openai-client\",\n",
" \"generation_id\": \"openai-client-gen-id22\",\n",
" \"trace_id\": \"openai-client-trace-id22\",\n",
" \"trace_user_id\": \"openai-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"print(response)"
],
"metadata": {
"id": "yA1XcgowaSRy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain Python"
],
"metadata": {
"id": "yl9qhDvnaTpL"
}
},
{
"cell_type": "code",
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.prompts.chat import (\n",
" ChatPromptTemplate,\n",
" HumanMessagePromptTemplate,\n",
" SystemMessagePromptTemplate,\n",
")\n",
"from langchain.schema import HumanMessage, SystemMessage\n",
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"anything\"\n",
"\n",
"chat = ChatOpenAI(\n",
" openai_api_base=\"http://0.0.0.0:4000\",\n",
" model = \"gpt-3.5-turbo\",\n",
" temperature=0.1,\n",
" extra_body={\n",
" \"metadata\": {\n",
" \"generation_name\": \"ishaan-generation-langchain-client\",\n",
" \"generation_id\": \"langchain-client-gen-id22\",\n",
" \"trace_id\": \"langchain-client-trace-id22\",\n",
" \"trace_user_id\": \"langchain-client-user-id2\"\n",
" }\n",
" }\n",
")\n",
"\n",
"messages = [\n",
" SystemMessage(\n",
" content=\"You are a helpful assistant that im using to make a test request to.\"\n",
" ),\n",
" HumanMessage(\n",
" content=\"test from litellm. tell me why it's amazing in 1 sentence\"\n",
" ),\n",
"]\n",
"response = chat(messages)\n",
"\n",
"print(response)"
],
"metadata": {
"id": "5MUZgSquaW5t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Curl"
],
"metadata": {
"id": "B9eMgnULbRaz"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"```\n",
"curl -X POST 'http://0.0.0.0:4000/chat/completions' \\\n",
" -H 'Content-Type: application/json' \\\n",
" -d '{\n",
" \"model\": \"gpt-3.5-turbo\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"what llm are you\"\n",
" }\n",
" ],\n",
" \"metadata\": {\n",
" \"generation_name\": \"ishaan-test-generation\",\n",
" \"generation_id\": \"gen-id22\",\n",
" \"trace_id\": \"trace-id22\",\n",
" \"trace_user_id\": \"user-id2\"\n",
" }\n",
"}'\n",
"```\n",
"\n"
],
"metadata": {
"id": "VWCCk5PFcmhS"
}
},
{
"cell_type": "markdown",
"source": [
"### LlamaIndex"
],
"metadata": {
"id": "drBAm2e1b6xe"
}
},
{
"cell_type": "code",
"source": [
"import os, dotenv\n",
"\n",
"from llama_index.llms import AzureOpenAI\n",
"from llama_index.embeddings import AzureOpenAIEmbedding\n",
"from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
"\n",
"llm = AzureOpenAI(\n",
" engine=\"azure-gpt-3.5\", # model_name on litellm proxy\n",
" temperature=0.0,\n",
" azure_endpoint=\"http://0.0.0.0:4000\", # litellm proxy endpoint\n",
" api_key=\"sk-1234\", # litellm proxy API Key\n",
" api_version=\"2023-07-01-preview\",\n",
")\n",
"\n",
"embed_model = AzureOpenAIEmbedding(\n",
" deployment_name=\"azure-embedding-model\",\n",
" azure_endpoint=\"http://0.0.0.0:4000\",\n",
" api_key=\"sk-1234\",\n",
" api_version=\"2023-07-01-preview\",\n",
")\n",
"\n",
"\n",
"documents = SimpleDirectoryReader(\"llama_index_data\").load_data()\n",
"service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)\n",
"index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n",
"\n",
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\"What did the author do growing up?\")\n",
"print(response)\n"
],
"metadata": {
"id": "d0bZcv8fb9mL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain JS"
],
"metadata": {
"id": "xypvNdHnb-Yy"
}
},
{
"cell_type": "code",
"source": [
"import { ChatOpenAI } from \"@langchain/openai\";\n",
"\n",
"\n",
"const model = new ChatOpenAI({\n",
" modelName: \"gpt-4\",\n",
" openAIApiKey: \"sk-1234\",\n",
" modelKwargs: {\"metadata\": \"hello world\"} // 👈 PASS Additional params here\n",
"}, {\n",
" basePath: \"http://0.0.0.0:4000\",\n",
"});\n",
"\n",
"const message = await model.invoke(\"Hi there!\");\n",
"\n",
"console.log(message);\n"
],
"metadata": {
"id": "R55mK2vCcBN2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### OpenAI JS"
],
"metadata": {
"id": "nC4bLifCcCiW"
}
},
{
"cell_type": "code",
"source": [
"const { OpenAI } = require('openai');\n",
"\n",
"const openai = new OpenAI({\n",
" apiKey: \"sk-1234\", // This is the default and can be omitted\n",
" baseURL: \"http://0.0.0.0:4000\"\n",
"});\n",
"\n",
"async function main() {\n",
" const chatCompletion = await openai.chat.completions.create({\n",
" messages: [{ role: 'user', content: 'Say this is a test' }],\n",
" model: 'gpt-3.5-turbo',\n",
" }, {\"metadata\": {\n",
" \"generation_name\": \"ishaan-generation-openaijs-client\",\n",
" \"generation_id\": \"openaijs-client-gen-id22\",\n",
" \"trace_id\": \"openaijs-client-trace-id22\",\n",
" \"trace_user_id\": \"openaijs-client-user-id2\"\n",
" }});\n",
"}\n",
"\n",
"main();\n"
],
"metadata": {
"id": "MICH8kIMcFpg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Anthropic SDK"
],
"metadata": {
"id": "D1Q07pEAcGTb"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"from anthropic import Anthropic\n",
"\n",
"client = Anthropic(\n",
" base_url=\"http://localhost:4000\", # proxy endpoint\n",
" api_key=\"sk-s4xN1IiLTCytwtZFJaYQrA\", # litellm proxy virtual key\n",
")\n",
"\n",
"message = client.messages.create(\n",
" max_tokens=1024,\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"Hello, Claude\",\n",
" }\n",
" ],\n",
" model=\"claude-3-opus-20240229\",\n",
")\n",
"print(message.content)"
],
"metadata": {
"id": "qBjFcAvgcI3t"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## /embeddings"
],
"metadata": {
"id": "dFAR4AJGcONI"
}
},
{
"cell_type": "markdown",
"source": [
"### OpenAI Python SDK"
],
"metadata": {
"id": "lgNoM281cRzR"
}
},
{
"cell_type": "code",
"source": [
"import openai\n",
"from openai import OpenAI\n",
"\n",
"# set base_url to your proxy server\n",
"# set api_key to send to proxy server\n",
"client = OpenAI(api_key=\"<proxy-api-key>\", base_url=\"http://0.0.0.0:4000\")\n",
"\n",
"response = client.embeddings.create(\n",
" input=[\"hello from litellm\"],\n",
" model=\"text-embedding-ada-002\"\n",
")\n",
"\n",
"print(response)\n"
],
"metadata": {
"id": "NY3DJhPfcQhA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Langchain Embeddings"
],
"metadata": {
"id": "hmbg-DW6cUZs"
}
},
{
"cell_type": "code",
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"sagemaker-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"SAGEMAKER EMBEDDINGS\")\n",
"print(query_result[:5])\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"bedrock-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"BEDROCK EMBEDDINGS\")\n",
"print(query_result[:5])\n",
"\n",
"embeddings = OpenAIEmbeddings(model=\"bedrock-titan-embeddings\", openai_api_base=\"http://0.0.0.0:4000\", openai_api_key=\"temp-key\")\n",
"\n",
"text = \"This is a test document.\"\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"\n",
"print(f\"TITAN EMBEDDINGS\")\n",
"print(query_result[:5])"
],
"metadata": {
"id": "lX2S8Nl1cWVP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Curl Request"
],
"metadata": {
"id": "oqGbWBCQcYfd"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
"```curl\n",
"curl -X POST 'http://0.0.0.0:4000/embeddings' \\\n",
" -H 'Content-Type: application/json' \\\n",
" -d ' {\n",
" \"model\": \"text-embedding-ada-002\",\n",
" \"input\": [\"write a litellm poem\"]\n",
" }'\n",
"```\n",
"\n"
],
"metadata": {
"id": "7rkIMV9LcdwQ"
}
}
]
}


@ -1,10 +1,10 @@
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -21,13 +21,13 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -49,7 +49,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -61,7 +61,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -70,7 +70,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -79,7 +79,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -109,7 +109,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -128,7 +128,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -148,7 +148,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -162,7 +162,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -174,7 +174,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -184,7 +184,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -193,19 +193,19 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -214,7 +214,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -234,7 +234,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -244,7 +244,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -253,7 +253,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -267,31 +267,31 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -305,7 +305,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -330,7 +330,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -339,7 +339,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -360,7 +360,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -369,7 +369,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -378,7 +378,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -388,7 +388,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -409,7 +409,7 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -422,13 +422,13 @@ Exception: Expecting value: line 1 column 1 (char 0)
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -438,7 +438,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: Expecting value: line 1 column 1 (char 0)
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -462,7 +462,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -482,7 +482,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -492,7 +492,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -516,7 +516,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -529,7 +529,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -546,13 +546,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -580,13 +580,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -624,7 +624,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -638,13 +638,13 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -660,7 +660,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -681,7 +681,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -691,31 +691,31 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -771,7 +771,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -780,7 +780,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -800,7 +800,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -820,7 +820,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -830,7 +830,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -840,7 +840,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -850,7 +850,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -862,13 +862,13 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -877,7 +877,7 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -898,7 +898,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -919,7 +919,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -936,19 +936,19 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -961,25 +961,25 @@ Exception: 'Response' object has no attribute 'get'
Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. Call all LLM APIs using the Ope
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -993,7 +993,7 @@ Question: Given this context, what is litellm? LiteLLM about: About
Call all LLM APIs using the OpenAI format.
Exception: 'Response' object has no attribute 'get'
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10


@ -20,7 +20,7 @@ Call all LLM APIs using the OpenAI format.
Response ID: 52dbbd49-eedb-4c11-8382-3ca7deb1af35 Url: /queue/response/52dbbd49-eedb-4c11-8382-3ca7deb1af35
Time: 3.50 seconds
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10
@ -35,7 +35,7 @@ Question: Does litellm support ooobagooba llms? how can i call oobagooba llms. C
Response ID: ae1e2b71-d711-456d-8df0-13ce0709eb04 Url: /queue/response/ae1e2b71-d711-456d-8df0-13ce0709eb04
Time: 5.60 seconds
Question: What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
Question: What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 10


@ -1,4 +1,4 @@
What endpoints does the litellm proxy have 💥 OpenAI Proxy Server
What endpoints does the litellm proxy have 💥 LiteLLM Proxy Server
LiteLLM Server manages:
Calling 100+ LLMs Huggingface/Bedrock/TogetherAI/etc. in the OpenAI ChatCompletions & Completions format


@ -18,13 +18,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.1
version: 0.2.2
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: v1.41.8
appVersion: v1.42.7
dependencies:
- name: "postgresql"


@ -1,5 +1,9 @@
# Helm Chart for LiteLLM
> [!IMPORTANT]
> This is community maintained. Please open an issue if you run into a bug.
> We recommend using [Docker or Kubernetes for production deployments](https://docs.litellm.ai/docs/proxy/prod)
## Prerequisites
- Kubernetes 1.21+


@ -1,23 +1,73 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Batches API
# [BETA] Batches API
Covers Batches, Files
## Quick Start
Work through the batch flow end-to-end:
- Create File for Batch Completion
- Create Batch Request
- List Batches
- Retrieve the Specific Batch and File Content
<Tabs>
<TabItem value="proxy" label="LiteLLM PROXY Server">
```bash
$ export OPENAI_API_KEY="sk-..."
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Create File for Batch Completion**
```shell
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
**Create Batch Request**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
```
**Retrieve the Specific Batch**
```bash
curl http://localhost:4000/v1/batches/batch_abc123 \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```
**List Batches**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```
</TabItem>
<TabItem value="sdk" label="SDK">
**Create File for Batch Completion**
@ -77,48 +127,15 @@ file_content = await litellm.afile_content(
print("file content = ", file_content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**List Batches**
```bash
$ export OPENAI_API_KEY="sk-..."
$ litellm
# RUNNING on http://0.0.0.0:4000
```
**Create File for Batch Completion**
```shell
curl https://api.openai.com/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
**Create Batch Request**
```bash
curl http://localhost:4000/v1/batches \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
```
**Retrieve the Specific Batch**
```bash
curl http://localhost:4000/v1/batches/batch_abc123 \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
```python
list_batches_response = litellm.list_batches(custom_llm_provider="openai", limit=2)
print("list_batches_response=", list_batches_response)
```
</TabItem>
</Tabs>
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/batch)


@ -7,14 +7,14 @@ Don't want to get crazy bills because either while you're calling LLM APIs **or*
:::info
If you want a server to manage user keys, budgets, etc. use our [OpenAI Proxy Server](./proxy/virtual_keys.md)
If you want a server to manage user keys, budgets, etc. use our [LiteLLM Proxy Server](./proxy/virtual_keys.md)
:::
LiteLLM exposes:
* `litellm.max_budget`: a global variable you can use to set the max budget (in USD) across all your litellm calls. If this budget is exceeded, it will raise a BudgetExceededError
* `BudgetManager`: A class to help set budgets per user. BudgetManager creates a dictionary to manage the user budgets, where the key is user and the object is their current cost + model-specific costs.
* `OpenAI Proxy Server`: A server to call 100+ LLMs with an openai-compatible endpoint. Manages user budgets, spend tracking, load balancing etc.
* `LiteLLM Proxy Server`: A server to call 100+ LLMs with an openai-compatible endpoint. Manages user budgets, spend tracking, load balancing etc.
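A rough sketch of the two SDK-level pieces above, `litellm.max_budget` and `BudgetManager` (method names follow the BudgetManager docs and the dollar amounts are placeholders, so treat this as illustrative):
```python
# Rough sketch of the budgeting hooks listed above (numbers are placeholders).
import litellm
from litellm import BudgetManager, completion

# Global cap across all litellm calls: exceeding it raises BudgetExceededError.
litellm.max_budget = 0.05  # USD

# Per-user budgets via BudgetManager.
budget_manager = BudgetManager(project_name="demo_project")
user = "user-1234"
budget_manager.create_budget(total_budget=1.00, user=user)  # $1 budget for this user

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
budget_manager.update_cost(completion_obj=response, user=user)  # record spend
print("current spend for user:", budget_manager.get_current_cost(user=user))
```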
## quick start


@ -48,19 +48,20 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea
|Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ |
|Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
|Huggingface| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | |
|Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | |
|AI21| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
|VertexAI| ✅ | ✅ | | ✅ | ✅ | | | | | | | | | ✅ | ✅ | | |
|Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (for anthropic) | |
|Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (model dependent) | |
|Sagemaker| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|TogetherAI| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ |
|AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | |
|NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | |
|Petals| ✅ | ✅ | | ✅ | ✅ | | | | | |
|Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | |
|Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | |✅| | | | | | |
|Databricks| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|ClarifAI| ✅ | ✅ | |✅ | ✅ | | | | | | | | | | |
|Github| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ |✅ (model dependent)|✅ (model dependent)| | |
:::note
By default, LiteLLM raises an exception if the openai param being passed in isn't supported.
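As an illustrative sketch of the two escape hatches mentioned here, `litellm.get_supported_openai_params()` and the documented `litellm.drop_params` toggle (the provider/param pairing below is just an example):
```python
# Illustrative sketch: inspect what a provider accepts, or drop unsupported
# OpenAI params instead of raising an exception.
import litellm
from litellm import completion, get_supported_openai_params

params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
print("bedrock anthropic supports:", params)

litellm.drop_params = True  # drop any OpenAI param the target provider doesn't accept

response = completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",  # assumes AWS creds are configured
    messages=[{"role": "user", "content": "hi"}],
    frequency_penalty=0.2,  # not supported by bedrock/anthropic; dropped instead of raising
)
```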


@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# JSON Mode
# Structured Outputs (JSON Mode)
## Quick Start
@ -61,8 +61,180 @@ params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_prov
assert "response_format" in params
```
## Pass in 'json_schema'
To use Structured Outputs, simply specify
```
response_format: { "type": "json_schema", "json_schema": … , "strict": true }
```
Works for OpenAI models
:::info
Support for passing in a pydantic object to litellm sdk will be [coming soon](https://github.com/BerriAI/litellm/issues/5074#issuecomment-2272355842)
:::
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import os
from litellm import completion
# add to env var
os.environ["OPENAI_API_KEY"] = ""
messages = [{"role": "user", "content": "List 5 cookie recipes"}]
resp = completion(
model="gpt-4o-2024-08-06",
messages=messages,
response_format={
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": False
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": False
},
"strict": True
},
}
)
print("Received={}".format(resp))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add openai model to config.yaml
```yaml
model_list:
- model_name: "gpt-4o"
litellm_params:
model: "gpt-4o-2024-08-06"
```
2. Start proxy with config.yaml
```bash
litellm --config /path/to/config.yaml
```
3. Call with OpenAI SDK / Curl!
Just replace the `base_url` in the OpenAI SDK to call the proxy with `json_schema` for OpenAI models.
**OpenAI SDK**
```python
from pydantic import BaseModel
from openai import OpenAI
client = OpenAI(
api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set)
base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL
)
class Step(BaseModel):
explanation: str
output: str
class MathReasoning(BaseModel):
steps: list[Step]
final_answer: str
completion = client.beta.chat.completions.parse(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
{"role": "user", "content": "how can I solve 8x + 7 = -23"}
],
response_format=MathReasoning,
)
math_reasoning = completion.choices[0].message.parsed
```
**Curl**
```bash
curl -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "system",
"content": "You are a helpful math tutor. Guide the user through the solution step by step."
},
{
"role": "user",
"content": "how can I solve 8x + 7 = -23"
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "math_reasoning",
"schema": {
"type": "object",
"properties": {
"steps": {
"type": "array",
"items": {
"type": "object",
"properties": {
"explanation": { "type": "string" },
"output": { "type": "string" }
},
"required": ["explanation", "output"],
"additionalProperties": false
}
},
"final_answer": { "type": "string" }
},
"required": ["steps", "final_answer"],
"additionalProperties": false
},
"strict": true
}
}
}'
```
</TabItem>
</Tabs>
## Validate JSON Schema
:::info
Support for doing this in the openai 'json_schema' format will be [coming soon](https://github.com/BerriAI/litellm/issues/5074#issuecomment-2272355842)
:::
For VertexAI models, LiteLLM supports passing the `response_schema` and validating the JSON output.
This works across Gemini (`vertex_ai_beta/`) + Anthropic (`vertex_ai/`) models.
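As an illustrative sketch of schema validation on Vertex AI (the `response_schema`/`enforce_validation` keys and the Gemini model string are assumptions to verify against the current LiteLLM docs):
```python
# Illustrative sketch: ask a Vertex AI Gemini model for JSON and have LiteLLM
# validate it against a schema. Assumes Vertex AI credentials are configured.
from litellm import completion

response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {"recipe_name": {"type": "string"}},
        "required": ["recipe_name"],
    },
}

resp = completion(
    model="vertex_ai_beta/gemini-1.5-pro",
    messages=[{"role": "user", "content": "List 5 cookie recipes"}],
    response_format={
        "type": "json_object",
        "response_schema": response_schema,
        "enforce_validation": True,  # raise if the output doesn't match the schema
    },
)

print(resp.choices[0].message.content)
```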


@ -270,7 +270,7 @@ response = embedding(
| embed-multilingual-v2.0 | `embedding(model="embed-multilingual-v2.0", input=["good morning from litellm", "this is another item"])` |
## HuggingFace Embedding Models
LiteLLM supports all Feature-Extraction Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction
LiteLLM supports all Feature-Extraction + Sentence Similarity Embedding models: https://huggingface.co/models?pipeline_tag=feature-extraction
### Usage
```python
@ -282,6 +282,25 @@ response = embedding(
input=["good morning from litellm"]
)
```
### Usage - Set input_type
LiteLLM infers the input type (feature-extraction or sentence-similarity) by making a GET request to the API base.
Override this by setting `input_type` yourself.
```python
from litellm import embedding
import os
os.environ['HUGGINGFACE_API_KEY'] = ""
response = embedding(
model='huggingface/microsoft/codebert-base',
input=["good morning from litellm", "you are a good bot"],
api_base = "https://p69xlsj6rpno5drq.us-east-1.aws.endpoints.huggingface.cloud",
input_type="sentence-similarity"
)
```
### Usage - Custom API Base
```python
from litellm import embedding


@ -29,8 +29,12 @@ This covers:
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ Set Max Request / File Size on Requests
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests)
- **Spend Tracking**
- **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./proxy/team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./proxy/team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics**
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)

View file

@ -0,0 +1,313 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [Beta] Fine-tuning API
:::info
This is an Enterprise only endpoint [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Supported Providers
- Azure OpenAI
- OpenAI
- Vertex AI
Add `finetune_settings` and `files_settings` to your litellm config.yaml to use the fine-tuning endpoints.
## Example config.yaml for `finetune_settings` and `files_settings`
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
# For /fine_tuning/jobs endpoints
finetune_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: os.environ/AZURE_API_KEY
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
- custom_llm_provider: "vertex_ai"
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json"
# for /files endpoints
files_settings:
- custom_llm_provider: azure
api_base: https://exampleopenaiendpoint-production.up.railway.app
api_key: fake-key
api_version: "2023-03-15-preview"
- custom_llm_provider: openai
api_key: os.environ/OPENAI_API_KEY
```
## Create File for fine-tuning
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
from openai import AsyncOpenAI

client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") # base_url is your litellm proxy url
file_name = "openai_batch_completions.jsonl"
response = await client.files.create(
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
file=open(file_name, "rb"),
purpose="fine-tune",
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
    -F purpose="fine-tune" \
-F custom_llm_provider="azure"\
-F file="@mydata.jsonl"
```
</TabItem>
</Tabs>
## Create fine-tuning job
<Tabs>
<TabItem value="azure" label="Azure OpenAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gpt-35-turbo-1106", # Azure OpenAI model you want to fine-tune
training_file="file-abc123", # file_id from create file response
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "azure",
"model": "gpt-35-turbo-1106",
"training_file": "file-abc123"
}'
```
</TabItem>
</Tabs>
</TabItem>
<TabItem value="Vertex" label="VertexAI">
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
ft_job = await client.fine_tuning.jobs.create(
model="gemini-1.0-pro-002", # Vertex model you want to fine-tune
training_file="gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl", # file_id from create file response
extra_body={"custom_llm_provider": "vertex_ai"}, # tell litellm proxy which provider to use
)
```
</TabItem>
<TabItem value="curl" label="curl (Unified API)">
```shell
curl http://localhost:4000/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"custom_llm_provider": "vertex_ai",
"model": "gemini-1.0-pro-002",
"training_file": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}'
```
</TabItem>
<TabItem value="curl-vtx" label="curl (VertexAI API)">
:::info
Use this to create Fine tuning Jobs in [the Vertex AI API Format](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/tuning#create-tuning)
:::
```shell
curl http://localhost:4000/v1/projects/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```
</TabItem>
</Tabs>
</TabItem>
</Tabs>
### Request Body
<Tabs>
<TabItem value="params" label="Supported Params">
* `model`
**Type:** string
**Required:** Yes
The name of the model to fine-tune
* `custom_llm_provider`
**Type:** `Literal["azure", "openai", "vertex_ai"]`
**Required:** Yes
The provider to route the fine-tuning job to. Select one of the [**supported providers**](#supported-providers)
* `training_file`
**Type:** string
**Required:** Yes
The ID of an uploaded file that contains training data.
- See **upload file** for how to upload a file.
- Your dataset must be formatted as a JSONL file.
* `hyperparameters`
**Type:** object
**Required:** No
The hyperparameters used for the fine-tuning job.
> #### Supported `hyperparameters`
> #### batch_size
**Type:** string or integer
**Required:** No
Number of examples in each batch. A larger batch size means that model parameters are updated less frequently, but with lower variance.
> #### learning_rate_multiplier
**Type:** string or number
**Required:** No
Scaling factor for the learning rate. A smaller learning rate may be useful to avoid overfitting.
> #### n_epochs
**Type:** string or integer
**Required:** No
The number of epochs to train the model for. An epoch refers to one full cycle through the training dataset.
* `suffix`
**Type:** string or null
**Required:** No
**Default:** null
A string of up to 18 characters that will be added to your fine-tuned model name.
Example: A `suffix` of "custom-model-name" would produce a model name like `ft:gpt-4o-mini:openai:custom-model-name:7p4lURel`.
* `validation_file`
**Type:** string or null
**Required:** No
The ID of an uploaded file that contains validation data.
- If provided, this data is used to generate validation metrics periodically during fine-tuning.
* `integrations`
**Type:** array or null
**Required:** No
A list of integrations to enable for your fine-tuning job.
* `seed`
**Type:** integer or null
**Required:** No
The seed controls the reproducibility of the job. Passing in the same seed and job parameters should produce the same results, but may differ in rare cases. If a seed is not specified, one will be generated for you.
</TabItem>
<TabItem value="example" label="Example Request Body">
```json
{
"model": "gpt-4o-mini",
"training_file": "file-abcde12345",
"hyperparameters": {
"batch_size": 4,
"learning_rate_multiplier": 0.1,
"n_epochs": 3
},
"suffix": "custom-model-v1",
"validation_file": "file-fghij67890",
"seed": 42
}
```
</TabItem>
</Tabs>
## Cancel fine-tuning job
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
# cancel specific fine tuning job
cancel_ft_job = await client.fine_tuning.jobs.cancel(
fine_tuning_job_id="123", # fine tuning job id
extra_body={"custom_llm_provider": "azure"}, # tell litellm proxy which provider to use
)
print("response from cancel ft job={}".format(cancel_ft_job))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl -X POST http://localhost:4000/v1/fine_tuning/jobs/ftjob-abc123/cancel \
-H "Authorization: Bearer sk-1234" \
-H "Content-Type: application/json" \
-d '{"custom_llm_provider": "azure"}'
```
</TabItem>
</Tabs>
## List fine-tuning jobs
<Tabs>
<TabItem value="openai" label="OpenAI Python SDK">
```python
list_ft_jobs = await client.fine_tuning.jobs.list(
extra_query={"custom_llm_provider": "azure"} # tell litellm proxy which provider to use
)
print("list of ft jobs={}".format(list_ft_jobs))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl -X GET 'http://localhost:4000/v1/fine_tuning/jobs?custom_llm_provider=azure' \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234"
```
</TabItem>
</Tabs>
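Putting these endpoints together, here is a rough end-to-end sketch against a running proxy (the proxy URL, key, and file name are placeholders; it simply chains the upload, create, and list calls shown above):

```python
import asyncio
from openai import AsyncOpenAI

# placeholders - point this at your running LiteLLM proxy
client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

async def main():
    # 1. upload the training file (routed to Azure via custom_llm_provider)
    file_obj = await client.files.create(
        file=open("openai_batch_completions.jsonl", "rb"),
        purpose="fine-tune",
        extra_body={"custom_llm_provider": "azure"},
    )

    # 2. create the fine-tuning job with the returned file id
    ft_job = await client.fine_tuning.jobs.create(
        model="gpt-35-turbo-1106",
        training_file=file_obj.id,
        extra_body={"custom_llm_provider": "azure"},
    )
    print("created fine-tuning job: {}".format(ft_job))

    # 3. list fine-tuning jobs for that provider
    ft_jobs = await client.fine_tuning.jobs.list(
        extra_query={"custom_llm_provider": "azure"}
    )
    print("list of ft jobs: {}".format(ft_jobs))

asyncio.run(main())
```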
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/fine-tuning)

View file

@ -10,14 +10,40 @@ https://github.com/BerriAI/litellm
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
- Track spend & set budgets per project [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## How to use LiteLLM
You can use litellm through either:
1. [OpenAI proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects
1. [LiteLLM Proxy Server](#openai-proxy) - Server to call 100+ LLMs, load balance, cost tracking across projects
2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking
## LiteLLM Python SDK
### When to use LiteLLM Proxy Server
:::tip
Use LiteLLM Proxy Server if you want a **central service to access multiple LLMs**
Typically used by Gen AI Enablement / ML Platform Teams
:::
- LiteLLM Proxy gives you a unified interface to access multiple LLMs (100+ LLMs)
- Track LLM Usage and setup guardrails
- Customize Logging, Guardrails, Caching per project
### When to use LiteLLM Python SDK
:::tip
Use LiteLLM Python SDK if you want to use LiteLLM in your **python code**
Typically used by developers building LLM projects
:::
- LiteLLM SDK gives you a unified interface to access multiple LLMs (100+ LLMs)
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
### Basic usage

View file

@ -0,0 +1,127 @@
import Image from '@theme/IdealImage';
# 🪣 Google Cloud Storage Buckets - Logging LLM Input/Output
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
  callbacks: ["gcs_bucket"] # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}
'
```
## Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
Example payload of a `/chat/completion` request logged on GCS
```json
{
"request_kwargs": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "This is a test"
}
],
"optional_params": {
"temperature": 0.7,
"max_tokens": 10,
"user": "ishaan-2",
"extra_body": {}
}
},
"response_obj": {
"id": "chatcmpl-bd836a8c-89bc-4abd-bee5-e3f1ebfdb541",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1722868456,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
},
"start_time": "2024-08-05 07:34:16",
"end_time": "2024-08-05 07:34:16"
}
```
## Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai

View file

@ -82,6 +82,47 @@ model_list:
```bash
litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="config-all" label="config - default all Anthropic Model">
Use this if you want to make requests to `claude-3-haiku-20240307`,`claude-3-opus-20240229`,`claude-2.1` without defining them on the config.yaml
#### Required env variables
```
ANTHROPIC_API_KEY=sk-ant****
```
```yaml
model_list:
- model_name: "*"
litellm_params:
model: "*"
```
```bash
litellm --config /path/to/config.yaml
```
Example Request for this config.yaml
**Ensure you use `anthropic/` prefix to route the request to Anthropic API**
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "anthropic/claude-3-haiku-20240307",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="cli" label="cli">

View file

@ -66,8 +66,15 @@ response = litellm.completion(
## Azure OpenAI Chat Completion Models
:::tip
**We support ALL Azure models, just set `model=azure/<your deployment name>` as a prefix when sending litellm requests**
:::
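For example, a minimal sketch, assuming a deployment named `my-gpt-4o-deployment` and the standard Azure env vars set for your resource:

```python
import os
from litellm import completion

os.environ["AZURE_API_KEY"] = ""      # your Azure OpenAI key
os.environ["AZURE_API_BASE"] = ""     # e.g. https://<your-resource>.openai.azure.com/
os.environ["AZURE_API_VERSION"] = ""  # e.g. 2024-02-01

# "my-gpt-4o-deployment" is a placeholder - use your own deployment name
response = completion(
    model="azure/my-gpt-4o-deployment",
    messages=[{"role": "user", "content": "Hello from litellm"}],
)
print(response.choices[0].message.content)
```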
| Model Name | Function Call |
|------------------|----------------------------------------|
| gpt-4o-mini | `completion('azure/<your deployment name>', messages)` |
| gpt-4o | `completion('azure/<your deployment name>', messages)` |
| gpt-4 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0314 | `completion('azure/<your deployment name>', messages)` |

View file

@ -360,6 +360,71 @@ resp = litellm.completion(
print(f"\nResponse: {resp}")
```
## Usage - Bedrock Guardrails
Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html)
<Tabs>
<TabItem value="sdk" label="LiteLLM SDK">
```python
from litellm import completion
# set env
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="anthropic.claude-v2",
messages=[
{
"content": "where do i buy coffee from? ",
"role": "user",
}
],
max_tokens=10,
guardrailConfig={
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
},
)
```
</TabItem>
<TabItem value="proxy" label="LiteLLM Proxy Server">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="anthropic.claude-v2", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
temperature=0.7,
extra_body={
"guardrailConfig": {
"guardrailIdentifier": "ff6ujrregl1q", # The identifier (ID) for the guardrail.
"guardrailVersion": "DRAFT", # The version of the guardrail.
"trace": "disabled", # The trace behavior for the guardrail. Can either be "disabled" or "enabled"
},
}
)
print(response)
```
</TabItem>
</Tabs>
## Usage - "Assistant Pre-fill"
If you're using Anthropic's Claude with Bedrock, you can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
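A minimal sketch of the pattern (model ID and AWS credentials are placeholders; the trailing `assistant` message nudges Claude to continue from `{`):

```python
import os
from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""

messages = [
    {"role": "user", "content": "How do you say 'Hello' in German? Return only JSON."},
    # last message is from the assistant - Claude continues from here
    {"role": "assistant", "content": "{"},
]

response = completion(model="bedrock/anthropic.claude-v2", messages=messages)
print(response.choices[0].message.content)
```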

View file

@ -1,7 +1,6 @@
# Custom API Server (Custom Format)
LiteLLM allows you to call your custom endpoint in the OpenAI ChatCompletion format
Call your custom torch-serve / internal LLM APIs via LiteLLM
:::info

View file

@ -5,6 +5,11 @@ import TabItem from '@theme/TabItem';
LiteLLM supports all models on Databricks
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
## Usage
@ -185,8 +190,17 @@ response = litellm.embedding(
## Supported Databricks Chat Completion Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
| Model Name | Command |
|----------------------------|------------------------------------------------------------------|
| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` |
| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` |
| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` |
| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` |
| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` |
@ -196,6 +210,13 @@ response = litellm.embedding(
## Supported Databricks Embedding Models
:::tip
**We support ALL Databricks models, just set `model=databricks/<any-model-on-databricks>` as a prefix when sending litellm requests**
:::
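For example, a minimal embedding sketch (the `DATABRICKS_API_KEY` / `DATABRICKS_API_BASE` env var names follow LiteLLM's usual provider conventions and are assumptions here):

```python
import os
from litellm import embedding

os.environ["DATABRICKS_API_KEY"] = ""   # assumed env var name
os.environ["DATABRICKS_API_BASE"] = ""  # assumed env var name

response = embedding(
    model="databricks/databricks-bge-large-en",
    input=["good morning from litellm"],
)
print(response)
```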
| Model Name | Command |
|----------------------------|------------------------------------------------------------------|
| databricks-bge-large-en | `embedding(model='databricks/databricks-bge-large-en', input=input)` |

View file

@ -1,3 +1,7 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Gemini - Google AI Studio
## Pre-requisites
@ -17,6 +21,335 @@ response = completion(
)
```
## Supported OpenAI Params
- temperature
- top_p
- max_tokens
- stream
- tools
- tool_choice
- response_format
- n
- stop
[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122)
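For instance, a quick sketch passing a few of these params through `completion` (the values are arbitrary):

```python
import os
from litellm import completion

os.environ["GEMINI_API_KEY"] = ""

response = completion(
    model="gemini/gemini-1.5-pro",
    messages=[{"role": "user", "content": "Write a haiku about the sea."}],
    temperature=0.2,
    top_p=0.9,
    max_tokens=100,
    stop=["\n\n"],
)
print(response.choices[0].message.content)
```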
## Passing Gemini Specific Params
### Response schema
LiteLLM supports sending `response_schema` as a param for Gemini-1.5-Pro on Google AI Studio.
**Response Schema**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response_schema = {
"type": "array",
"items": {
"type": "object",
"properties": {
"recipe_name": {
"type": "string",
},
},
"required": ["recipe_name"],
},
}
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    response_format={"type": "json_object", "response_schema": response_schema} # 👈 KEY CHANGE
)

print(json.loads(response.choices[0].message.content))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
  "model": "gemini-pro",
  "messages": [
    {"role": "user", "content": "List 5 popular cookie recipes."}
  ],
  "response_format": {
    "type": "json_object",
    "response_schema": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "recipe_name": {"type": "string"}
        },
        "required": ["recipe_name"]
      }
    }
  }
}'
```
</TabItem>
</Tabs>
**Validate Schema**
To validate the response_schema, set `enforce_validation: true`.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion, JSONSchemaValidationError
try:
completion(
model="gemini/gemini-1.5-pro",
messages=messages,
response_format={
"type": "json_object",
"response_schema": response_schema,
"enforce_validation": true # 👈 KEY CHANGE
}
)
except JSONSchemaValidationError as e:
print("Raw Response: {}".format(e.raw_response))
raise e
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
  "model": "gemini-pro",
  "messages": [
    {"role": "user", "content": "List 5 popular cookie recipes."}
  ],
  "response_format": {
    "type": "json_object",
    "response_schema": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "recipe_name": {"type": "string"}
        },
        "required": ["recipe_name"]
      }
    },
    "enforce_validation": true
  }
}'
```
</TabItem>
</Tabs>
LiteLLM will validate the response against the schema, and raise a `JSONSchemaValidationError` if the response does not match the schema.
JSONSchemaValidationError inherits from `openai.APIError`
Access the raw response with `e.raw_response`
### GenerationConfig Params
To pass additional GenerationConfig params - e.g. `topK`, just pass it in the request body of the call, and LiteLLM will pass it straight through as a key-value pair in the request body.
[**See Gemini GenerationConfigParams**](https://ai.google.dev/api/generate-content#v1beta.GenerationConfig)
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    topK=1 # 👈 KEY CHANGE
)

print(response.choices[0].message.content)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"topK": 1 # 👈 KEY CHANGE
}
'
```
</TabItem>
</Tabs>
## Specifying Safety Settings
In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
@ -91,6 +424,72 @@ assert isinstance(
```
## JSON Mode
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import json
import os
os.environ['GEMINI_API_KEY'] = ""
messages = [
{
"role": "user",
"content": "List 5 popular cookie recipes."
}
]
response = completion(
    model="gemini/gemini-1.5-pro",
    messages=messages,
    response_format={"type": "json_object"} # 👈 KEY CHANGE
)

print(json.loads(response.choices[0].message.content))
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add model to config.yaml
```yaml
model_list:
- model_name: gemini-pro
litellm_params:
model: gemini/gemini-1.5-pro
api_key: os.environ/GEMINI_API_KEY
```
2. Start Proxy
```
$ litellm --config /path/to/config.yaml
```
3. Make Request!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gemini-pro",
"messages": [
{"role": "user", "content": "List 5 popular cookie recipes."}
],
"response_format": {"type": "json_object"}
}
'
```
</TabItem>
</Tabs>
# Gemini-Pro-Vision
LiteLLM Supports the following image types passed in `url`
- Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
@ -141,8 +540,13 @@ print(content)
```
## Chat Models
:::tip
**We support ALL Gemini models, just set `model=gemini/<any-model-on-gemini>` as a prefix when sending litellm requests**
:::
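As a quick streaming sketch (any `gemini/` model from the table below works the same way):

```python
import os
from litellm import completion

os.environ["GEMINI_API_KEY"] = ""

response = completion(
    model="gemini/gemini-pro",
    messages=[{"role": "user", "content": "Tell me a short story."}],
    stream=True,  # `stream` is one of the supported OpenAI params listed above
)
for chunk in response:
    print(chunk)
```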
| Model Name | Function Call | Required OS Variables |
|-----------------------|--------------------------------------------------------|--------------------------------|
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion('gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro | `completion(model='gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-latest | `completion(model='gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion(model='gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |

View file

@ -0,0 +1,261 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🆕 Github
https://github.com/marketplace/models
:::tip
**We support ALL Github models, just set `model=github/<any-model-on-github>` as a prefix when sending litellm requests**
:::
## API Key
```python
# env variable
os.environ['GITHUB_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['GITHUB_API_KEY'] = ""
response = completion(
model="github/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['GITHUB_API_KEY'] = ""
response = completion(
model="github/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Usage with LiteLLM Proxy
### 1. Set Github Models on config.yaml
```yaml
model_list:
- model_name: github-llama3-8b-8192 # Model Alias to use for requests
litellm_params:
model: github/llama3-8b-8192
api_key: "os.environ/GITHUB_API_KEY" # ensure you have `GITHUB_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
Make request to litellm proxy
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "github-llama3-8b-8192",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="github-llama3-8b-8192", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "github-llama3-8b-8192",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Github Models Supported!
We support ALL Github models, just set `github/` as a prefix when sending completion requests
| Model Name | Usage |
|--------------------|---------------------------------------------------------|
| llama-3.1-8b-instant | `completion(model="github/llama-3.1-8b-instant", messages)` |
| llama-3.1-70b-versatile | `completion(model="github/llama-3.1-70b-versatile", messages)` |
| llama-3.1-405b-reasoning | `completion(model="github/llama-3.1-405b-reasoning", messages)` |
| llama3-8b-8192 | `completion(model="github/llama3-8b-8192", messages)` |
| llama3-70b-8192 | `completion(model="github/llama3-70b-8192", messages)` |
| llama2-70b-4096 | `completion(model="github/llama2-70b-4096", messages)` |
| mixtral-8x7b-32768 | `completion(model="github/mixtral-8x7b-32768", messages)` |
| gemma-7b-it | `completion(model="github/gemma-7b-it", messages)` |
## Github - Tool / Function Calling Example
```python
# Example dummy function hard coded to return the current weather
import json
def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
elif "san francisco" in location.lower():
return json.dumps(
{"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
)
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
# Step 1: send the conversation and available functions to the model
messages = [
{
"role": "system",
"content": "You are a function calling LLM that uses the data extracted from get_current_weather to answer questions about the weather in San Francisco.",
},
{
"role": "user",
"content": "What's the weather like in San Francisco?",
},
]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["location"],
},
},
}
]
response = litellm.completion(
model="github/llama3-8b-8192",
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
)
print("Response\n", response)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
# Step 2: check if the model wanted to call a function
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
}
messages.append(
response_message
) # extend conversation with assistant's reply
print("Response message\n", response_message)
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model="github/llama3-8b-8192", messages=messages
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Ollama
LiteLLM supports all models from [Ollama](https://github.com/ollama/ollama)
@ -84,6 +87,120 @@ response = completion(
)
```
## Example Usage - Tool Calling
To use ollama tool calling, pass `tools=[{..}]` to `litellm.completion()`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import litellm
## [OPTIONAL] REGISTER MODEL - not all ollama models support function calling, litellm defaults to json mode tool calls if native tool calling not supported.
# litellm.register_model(model_cost={
# "ollama_chat/llama3.1": {
# "supports_function_calling": true
# },
# })
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
}
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="ollama_chat/llama3.1",
messages=messages,
tools=tools
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: "llama3.1"
litellm_params:
model: "ollama_chat/llama3.1"
model_info:
supports_function_calling: true
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "llama3.1",
"messages": [
{
"role": "user",
"content": "What'\''s the weather like in Boston today?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto",
"stream": true
}'
```
</TabItem>
</Tabs>
## Using ollama `api/chat`
In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat`

View file

@ -166,6 +166,7 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL
| gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` |
| gpt-4o-mini-2024-07-18 | `response = completion(model="gpt-4o-mini-2024-07-18", messages=messages)` |
| gpt-4o | `response = completion(model="gpt-4o", messages=messages)` |
| gpt-4o-2024-08-06 | `response = completion(model="gpt-4o-2024-08-06", messages=messages)` |
| gpt-4o-2024-05-13 | `response = completion(model="gpt-4o-2024-05-13", messages=messages)` |
| gpt-4-turbo | `response = completion(model="gpt-4-turbo", messages=messages)` |
| gpt-4-turbo-preview | `response = completion(model="gpt-4-0125-preview", messages=messages)` |

View file

@ -775,7 +775,6 @@ vertex_ai_location = "your-vertex-location" # can also set this as os.environ["V
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
temperature=0.7,
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
@ -828,6 +827,178 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
</Tabs>
## Mistral API
[**Supported OpenAI Params**](https://github.com/BerriAI/litellm/blob/e0f3cd580cb85066f7d36241a03c30aa50a8a31d/litellm/llms/openai.py#L137)
| Model Name | Function Call |
|------------------|--------------------------------------|
| mistral-large@latest | `completion('vertex_ai/mistral-large@latest', messages)` |
| mistral-large@2407 | `completion('vertex_ai/mistral-large@2407', messages)` |
| mistral-nemo@latest | `completion('vertex_ai/mistral-nemo@latest', messages)` |
| codestral@latest | `completion('vertex_ai/codestral@latest', messages)` |
| codestral@2405 | `completion('vertex_ai/codestral@2405', messages)` |
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
model = "mistral-large@2407"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: vertex-mistral
litellm_params:
model: vertex_ai/mistral-large@2407
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: vertex-mistral
litellm_params:
model: vertex_ai/mistral-large@2407
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "vertex-mistral", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}'
```
</TabItem>
</Tabs>
### Usage - Codestral FIM
Call Codestral on VertexAI via the OpenAI [`/v1/completions`](https://platform.openai.com/docs/api-reference/completions/create) endpoint for FIM tasks.
Note: You can also call Codestral via `/chat/completion`.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import text_completion
import os
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
# OR run `!gcloud auth print-access-token` in your terminal
model = "codestral@2405"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = text_completion(
model="vertex_ai/" + model,
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
suffix="return True", # optional
temperature=0, # optional
top_p=1, # optional
max_tokens=10, # optional
min_tokens=10, # optional
seed=10, # optional
stop=["return"], # optional
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: vertex-codestral
litellm_params:
model: vertex_ai/codestral@2405
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: vertex-codestral
litellm_params:
model: vertex_ai/codestral@2405
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!**
```bash
curl -X POST 'http://0.0.0.0:4000/completions' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"model": "vertex-codestral", # 👈 the 'model_name' in config
"prompt": "def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
"suffix":"return True", # optional
"temperature":0, # optional
"top_p":1, # optional
"max_tokens":10, # optional
"min_tokens":10, # optional
"seed":10, # optional
"stop":["return"], # optional
}'
```
</TabItem>
</Tabs>
## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|

View file

@ -0,0 +1,191 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🪣 Logging GCS, s3 Buckets
LiteLLM Supports Logging to the following Cloud Buckets
- (Enterprise) ✨ [Google Cloud Storage Buckets](#logging-proxy-inputoutput-to-google-cloud-storage-buckets)
- (Free OSS) [Amazon s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
## Logging Proxy Input/Output to Google Cloud Storage Buckets
Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en)
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
### Usage
1. Add `gcs_bucket` to LiteLLM Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
  callbacks: ["gcs_bucket"] # 👈 KEY CHANGE
```
2. Set required env variables
```shell
GCS_BUCKET_NAME="<your-gcs-bucket-name>"
GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
3. Start Proxy
```
litellm --config /path/to/config.yaml
```
4. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
    ]
}
'
```
### Expected Logs on GCS Buckets
<Image img={require('../../img/gcs_bucket.png')} />
### Fields Logged on GCS Buckets
Example payload of a `/chat/completion` request logged on GCS
```json
{
"request_kwargs": {
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "This is a test"
}
],
"optional_params": {
"temperature": 0.7,
"max_tokens": 10,
"user": "ishaan-2",
"extra_body": {}
}
},
"response_obj": {
"id": "chatcmpl-bd836a8c-89bc-4abd-bee5-e3f1ebfdb541",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hi!",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"created": 1722868456,
"model": "gpt-3.5-turbo",
"object": "chat.completion",
"system_fingerprint": null,
"usage": {
"prompt_tokens": 10,
"completion_tokens": 20,
"total_tokens": 30
}
},
"start_time": "2024-08-05 07:34:16",
"end_time": "2024-08-05 07:34:16"
}
```
### Getting `service_account.json` from Google Cloud Console
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Search for IAM & Admin
3. Click on Service Accounts
4. Select a Service Account
5. Click on 'Keys' -> Add Key -> Create New Key -> JSON
6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT`
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket

View file

@ -260,6 +260,21 @@ curl --location 'http://0.0.0.0:4000/cache/ping' -H "Authorization: Bearer sk-1
```
## Advanced
### Control Call Types Caching is on for - (`/chat/completion`, `/embeddings`, etc.)
By default, caching is on for all call types. You can control which call types caching is on for by setting `supported_call_types` in `cache_params`
**Cache will only be on for the call types specified in `supported_call_types`**
```yaml
litellm_settings:
cache: True
cache_params:
type: redis
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
```
### Set Cache Params on config.yaml
```yaml
model_list:
@ -280,7 +295,8 @@ litellm_settings:
password: "your_password" # The password for the Redis cache. Required if type is "redis".
# Optional configurations
supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
```
### Turn on / off caching per request.
@ -625,11 +641,8 @@ cache_params:
# List of litellm call types to cache for
# Options: "completion", "acompletion", "embedding", "aembedding"
supported_call_types:
- completion
- acompletion
- embedding
- aembedding
supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"]
# /chat/completions, /completions, /embeddings, /audio/transcriptions
# Redis cache parameters
host: localhost # Redis server hostname or IP address

View file

@ -60,6 +60,13 @@ model_list:
model_info:
version: 2
# Use this if you want to make requests to `claude-3-haiku-20240307`,`claude-3-opus-20240229`,`claude-2.1` without defining them on the config.yaml
# Default models
# Works for ALL Providers and needs the default provider credentials in .env
- model_name: "*"
litellm_params:
model: "*"
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
drop_params: True
success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
@ -288,7 +295,7 @@ Dynamically call any model from any given provider without the need to predefine
model_list:
- model_name: "*" # all requests where model not in your config go to this deployment
litellm_params:
model: "openai/*" # passes our validation check that a real provider is given
model: "*" # passes our validation check that a real provider is given
```
2. Start LiteLLM proxy

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Custom Pricing - Sagemaker, etc.
# Custom LLM Pricing - Sagemaker, Azure, etc
Use this to register custom pricing for models.
@ -16,39 +16,9 @@ LiteLLM already has pricing for any model in our [model cost map](https://github
:::
## Quick Start
## Cost Per Second (e.g. Sagemaker)
Register custom pricing for sagemaker completion model.
For cost per second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os
from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
### Usage with OpenAI Proxy Server
### Usage with LiteLLM Proxy Server
**Step 1: Add pricing to config.yaml**
```yaml
@ -75,38 +45,7 @@ litellm /path/to/config.yaml
## Cost Per Token (e.g. Azure)
```python
import os
from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
            messages = [{ "content": "Hello, how are you?","role": "user"}],
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
### Usage with OpenAI Proxy Server
### Usage with LiteLLM Proxy Server
```yaml
model_list:

View file

@ -35,6 +35,22 @@ $ litellm --detailed_debug
os.environ["LITELLM_LOG"] = "DEBUG"
```
### Debug Logs
Run the proxy with `--detailed_debug` to view detailed debug logs
```shell
litellm --config /path/to/config.yaml --detailed_debug
```
When making requests, you should see the POST request LiteLLM sends to the LLM in the terminal output
```shell
POST Request Sent from LiteLLM:
curl -X POST \
https://api.openai.com/v1/chat/completions \
-H 'content-type: application/json' -H 'Authorization: Bearer sk-qnWGUIW9****************************************' \
-d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "this is a test request, write a short poem"}]}'
```
## JSON LOGS
Set `JSON_LOGS="True"` in your env:

View file

@ -246,7 +246,7 @@ helm install lite-helm ./litellm-helm
kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
```
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.
</TabItem>
@ -254,6 +254,15 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
**That's it! That's the quick start to deploy litellm**
## Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl
:::info
💡 Go here 👉 [to make your first LLM API Request](user_keys)
LiteLLM is compatible with several SDKs - including the OpenAI SDK, Anthropic SDK, Mistral SDK, LlamaIndex, Langchain (JS, Python)
:::
## Options to deploy LiteLLM
| Docs | When to Use |
@ -292,7 +301,7 @@ docker run \
--config /app/config.yaml --detailed_debug
```
Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
Your LiteLLM Proxy Server is now running on `http://0.0.0.0:4000`.
</TabItem>
<TabItem value="kubernetes-deploy" label="Kubernetes">
@ -390,7 +399,7 @@ kubectl apply -f /path/to/service.yaml
kubectl port-forward service/litellm-service 4000:4000
```
Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
Your LiteLLM Proxy Server is now running on `http://0.0.0.0:4000`.
</TabItem>
@ -432,7 +441,7 @@ kubectl \
4000:4000
```
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.
If you need to set your litellm proxy config.yaml, you can find this in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm-helm/values.yaml)
@ -477,7 +486,7 @@ helm install lite-helm ./litellm-helm
kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
```
Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
Your LiteLLM Proxy Server is now running on `http://127.0.0.1:4000`.
</TabItem>
</Tabs>
@ -549,6 +558,39 @@ docker run --name litellm-proxy \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
```
## LiteLLM without Internet Connection
By default `prisma generate` downloads [prisma's engine binaries](https://www.prisma.io/docs/orm/reference/environment-variables-reference#custom-engine-file-locations). This can cause errors when running without an internet connection.
Use this dockerfile to build an image which pre-generates the prisma binaries.
```Dockerfile
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest
# Set the working directory to /app
WORKDIR /app
### [👇 KEY STEP] ###
# Install Prisma CLI and generate Prisma client
RUN pip install prisma
RUN prisma generate
### FIN ####
# Expose the necessary port
EXPOSE 4000
# Override the CMD instruction with your desired command and arguments
# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD
# CMD ["--port", "4000", "--config", "config.yaml"]
# Define the command to run your app
ENTRYPOINT ["litellm"]
CMD ["--port", "4000"]
```
## Advanced Deployment Settings
### 1. Customization of the server root path (custom Proxy base url)
@ -563,24 +605,87 @@ In a Kubernetes deployment, it's possible to utilize a shared DNS to host multip
Customize the root path to eliminate the need for employing multiple DNS configurations during deployment.
Step 1.
👉 Set `SERVER_ROOT_PATH` in your .env and this will be set as your server root path
```
export SERVER_ROOT_PATH="/api/v1"
```
**Step 1. Run Proxy with `SERVER_ROOT_PATH` set in your env **
**Step 2** (If you want the Proxy Admin UI to work with your root path you need to use this dockerfile)
- Use the dockerfile below (it uses litellm as a base image)
- 👉 Set `UI_BASE_PATH=$SERVER_ROOT_PATH/ui` in the Dockerfile, example `UI_BASE_PATH=/api/v1/ui`
Dockerfile
```shell
docker run --name litellm-proxy \
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e SERVER_ROOT_PATH="/api/v1" \
-p 4000:4000 \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
# Use the provided base image
FROM ghcr.io/berriai/litellm:main-latest
# Set the working directory to /app
WORKDIR /app
# Install Node.js and npm (adjust version as needed)
RUN apt-get update && apt-get install -y nodejs npm
# Copy the UI source into the container
COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard
# Set an environment variable for UI_BASE_PATH
# This can be overridden at build time
# set UI_BASE_PATH to "<your server root path>/ui"
# 👇👇 Enter your UI_BASE_PATH here
ENV UI_BASE_PATH="/api/v1/ui"
# Build the UI with the specified UI_BASE_PATH
WORKDIR /app/ui/litellm-dashboard
RUN npm install
RUN UI_BASE_PATH=$UI_BASE_PATH npm run build
# Create the destination directory
RUN mkdir -p /app/litellm/proxy/_experimental/out
# Move the built files to the appropriate location
# Assuming the build output is in ./out directory
RUN rm -rf /app/litellm/proxy/_experimental/out/* && \
mv ./out/* /app/litellm/proxy/_experimental/out/
# Switch back to the main app directory
WORKDIR /app
# Make sure your entrypoint.sh is executable
RUN chmod +x entrypoint.sh
# Expose the necessary port
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
# only use --detailed_debug for debugging
CMD ["--port", "4000", "--config", "config.yaml"]
```
**Step 3** build this Dockerfile
```shell
docker build -f Dockerfile -t litellm-prod-build . --progress=plain
```
**Step 4. Run Proxy with `SERVER_ROOT_PATH` set in your env **
```shell
docker run \
-v $(pwd)/proxy_config.yaml:/app/config.yaml \
-p 4000:4000 \
-e LITELLM_LOG="DEBUG"\
-e SERVER_ROOT_PATH="/api/v1"\
-e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
-e LITELLM_MASTER_KEY="sk-1234"\
litellm-prod-build \
--config /app/config.yaml
```
After running the proxy you can access it on `http://0.0.0.0:4000/api/v1/` (since we set `SERVER_ROOT_PATH="/api/v1"`)
**Step 2. Verify Running on correct path**
**Step 5. Verify Running on correct path**
<Image img={require('../../img/custom_root_path.png')} />
@ -785,3 +890,31 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
Your LiteLLM container should be running now on the defined port e.g. `4000`.
### IAM-based Auth for RDS DB
1. Set AWS env var
```bash
export AWS_WEB_IDENTITY_TOKEN='/path/to/token'
export AWS_ROLE_NAME='arn:aws:iam::123456789012:role/MyRole'
export AWS_SESSION_NAME='MySession'
```
[**See all Auth options**](https://github.com/BerriAI/litellm/blob/089a4f279ad61b7b3e213d8039fb9b75204a7abc/litellm/proxy/auth/rds_iam_token.py#L165)
2. Add RDS credentials to env
```bash
export DATABASE_USER="db-user"
export DATABASE_PORT="5432"
export DATABASE_HOST="database-1-instance-1.cs1ksmwz2xt3.us-west-2.rds.amazonaws.com"
export DATABASE_NAME="database-1-instance-1"
```
3. Run proxy with iam+rds
```bash
litellm --config /path/to/config.yaml --iam_token_db_auth
```

View file

@ -21,10 +21,14 @@ Features:
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ Set Max Request / File Size on Requests
- ✅ [Set Max Request Size / File Size on Requests](#set-max-request--response-size-on-litellm-proxy)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests)
- **Enterprise Spend Tracking Features**
- **Customize Logging, Guardrails, Caching per project**
- ✅ [Team Based Logging](./team_logging.md) - Allow each team to use their own Langfuse Project / custom callbacks
- ✅ [Disable Logging for a Team](./team_logging.md#disable-logging-for-a-team) - Switch off all logging for a team/project (GDPR Compliance)
- **Spend Tracking & Data Exports**
- ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags)
- ✅ [Exporting LLM Logs to GCS Bucket](./proxy/bucket#🪣-logging-gcs-s3-buckets)
- ✅ [`/spend/report` API endpoint](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics**
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
@ -1288,3 +1292,52 @@ How it works?
**Note:** Setting an environment variable within a Python script using os.environ will not make that variable accessible via SSH sessions or any other new processes that are started independently of the Python script. Environment variables set this way only affect the current process and its child processes.
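For example, a minimal sketch of this behaviour (hypothetical variable name):
```python
import os
import subprocess

# Visible to this process and to any child processes it spawns ...
os.environ["MY_LLM_SETTING"] = "value"
subprocess.run(
    ["python3", "-c", "import os; print(os.environ.get('MY_LLM_SETTING'))"]
)  # prints "value"

# ... but NOT to a separate SSH session or any independently started process,
# which will see MY_LLM_SETTING as unset.
```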
## Set Max Request / Response Size on LiteLLM Proxy
Use this if you want to set a maximum request / response size for your proxy server. If a request exceeds the configured size, it gets rejected and a Slack alert is triggered.
#### Usage
**Step 1.** Set `max_request_size_mb` and `max_response_size_mb`
For this example we set a very low limit on `max_request_size_mb` and expect the request to get rejected
:::info
In production we recommend setting a `max_request_size_mb` / `max_response_size_mb` around `32 MB`
:::
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
# Security controls
max_request_size_mb: 0.000000001 # 👈 Key Change - Max Request Size in MB. Set this very low for testing
max_response_size_mb: 100 # 👈 Key Change - Max Response Size in MB
```
**Step 2.** Test it with `/chat/completions` request
```shell
curl http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "user", "content": "Hello, Claude!"}
]
}'
```
**Expected Response from request**
We expect this to fail since the request size is over `max_request_size_mb`
```shell
{"error":{"message":"Request size is too large. Request size is 0.0001125335693359375 MB. Max size is 1e-09 MB","type":"bad_request_error","param":"content-length","code":400}}
```

View file

@ -8,7 +8,6 @@ Log Proxy input, output, and exceptions using:
- Langsmith
- DataDog
- DynamoDB
- s3 Bucket
- etc.
import Image from '@theme/IdealImage';
@ -714,6 +713,23 @@ Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector
<Image img={require('../../img/otel_parent.png')} />
### Forwarding `Traceparent HTTP Header` to LLM APIs
Use this if you want to forward the traceparent headers to your self-hosted LLMs, like vLLM
Set `forward_traceparent_to_llm_provider: True` in your `config.yaml`. This will forward the `traceparent` header to your LLM API
:::warning
Only use this for self-hosted LLMs; it can cause Bedrock and Vertex AI calls to fail
:::
```yaml
litellm_settings:
forward_traceparent_to_llm_provider: True
```
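With this enabled, a client request carrying a W3C `traceparent` header would be passed through to the upstream API. Sketch below — the model name and span id are placeholders, and the trace id is reused from the OTEL example above:
```shell
curl http://0.0.0.0:4000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-1234" \
  -H "traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01" \
  -d '{"model": "my-vllm-model", "messages": [{"role": "user", "content": "hi"}]}'
```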
## Custom Callback Class [Async]
Use this when you want to run custom callbacks in `python`
@ -1362,66 +1378,6 @@ Expected output on Datadog
<Image img={require('../../img/dd_small1.png')} />
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 Bucket
**Step 1** Set AWS Credentials in .env
```shell
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["s3"]
s3_callback_params:
s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is the AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "Azure OpenAI GPT-4 East",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
Your logs should be available on the specified s3 Bucket
## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set

View file

@ -35,6 +35,7 @@ general_settings:
Authorization: "bearer os.environ/COHERE_API_KEY" # (Optional) Auth Header to forward to your Endpoint
content-type: application/json # (Optional) Extra Headers to pass to this endpoint
accept: application/json
forward_headers: True # (Optional) Forward all headers from the incoming request to the target endpoint
```
**Step 2** Start Proxy Server in detailed_debug mode
@ -220,6 +221,7 @@ general_settings:
* `LANGFUSE_PUBLIC_KEY` *string*: Your Langfuse account public key - only set this when forwarding to Langfuse.
* `LANGFUSE_SECRET_KEY` *string*: Your Langfuse account secret key - only set this when forwarding to Langfuse.
* `<your-custom-header>` *string*: Pass any custom header key/value pair
* `forward_headers` *Optional(boolean)*: If true, all headers from the incoming request will be forwarded to the target endpoint. Default is `False`.
## Custom Chat Endpoints (Anthropic/Bedrock/Vertex)

View file

@ -84,6 +84,20 @@ Set `export LITELLM_MODE="PRODUCTION"`
This disables the load_dotenv() functionality, which would otherwise automatically load your environment credentials from the local `.env`.
## 5. Set LiteLLM Salt Key
If you plan on using the DB, set a salt key for encrypting/decrypting variables in the DB.
Do not change this after adding a model. It is used to encrypt / decrypt your LLM API Key credentials.
We recommend using the https://1password.com/password-generator/ password generator to get a random hash for your litellm salt key.
```bash
export LITELLM_SALT_KEY="sk-1234"
```
[**See Code**](https://github.com/BerriAI/litellm/blob/036a6821d588bd36d170713dcf5a72791a694178/litellm/proxy/common_utils/encrypt_decrypt_utils.py#L15)
## Extras
### Expected Performance in Production

View file

@ -13,7 +13,7 @@ LiteLLM Supports the following methods for detecting prompt injection attacks
Use this if you want to reject /chat, /completions, /embeddings calls that have prompt injection attacks
LiteLLM uses [LakerAI API](https://platform.lakera.ai/) to detect if a request has a prompt injection attack
LiteLLM uses [LakeraAI API](https://platform.lakera.ai/) to detect if a request has a prompt injection attack
### Usage

View file

@ -255,6 +255,12 @@ litellm --config your_config.yaml
## Using LiteLLM Proxy - Curl Request, OpenAI Package, Langchain
:::info
LiteLLM is compatible with several SDKs - including the OpenAI SDK, Anthropic SDK, Mistral SDK, LlamaIndex, and Langchain (JS, Python)
[More examples here](user_keys)
:::
<Tabs>
<TabItem value="Curl" label="Curl Request">
@ -382,6 +388,34 @@ print(response)
```
</TabItem>
<TabItem value="anthropic-py" label="Anthropic Python SDK">
```python
import os
from anthropic import Anthropic
client = Anthropic(
base_url="http://localhost:4000", # proxy endpoint
api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
```
</TabItem>
</Tabs>
[**More Info**](./configs.md)
@ -396,165 +430,6 @@ print(response)
- POST `/key/generate` - generate a key to access the proxy
## Using with OpenAI compatible projects
Set `base_url` to the LiteLLM Proxy server
<Tabs>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="librechat" label="LibreChat">
#### Start the LiteLLM proxy
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
```shell
git clone https://github.com/danny-avila/LibreChat.git
```
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key).
```env
OPENAI_API_KEY=sk-1234
```
#### 4. Run LibreChat:
```shell
docker compose up
```
</TabItem>
<TabItem value="continue-dev" label="ContinueDev">
Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart).
In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model.
```python
default=OpenAI(
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:4000" # your proxy server url
),
```
Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial.
</TabItem>
<TabItem value="aider" label="Aider">
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
```shell
pip install pyautogen
```
```python
from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
]
response = oai.Completion.create(config_list=config_list, prompt="Hi")
print(response) # works fine
llm_config={
"config_list": config_list,
}
assistant = AssistantAgent("assistant", llm_config=llm_config)
user_proxy = UserProxyAgent("user_proxy")
user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list)
```
Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
</TabItem>
<TabItem value="guidance" label="guidance">
A guidance language for controlling large language models.
https://github.com/guidance-ai/guidance
**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it.
**Fix**: Start your proxy using the `--drop_params` flag
```shell
litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params
```
```python
import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}
{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}
{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}
''', llm=gpt4)
result = experts(query='How can I be more productive?')
print(result)
```
</TabItem>
</Tabs>
## Debugging Proxy
Events that occur during normal operation

View file

@ -50,7 +50,7 @@ Detailed information about [routing strategies can be found here](../routing)
$ litellm --config /path/to/config.yaml
```
### Test - Load Balancing
### Test - Simple Call
Here requests with model=gpt-3.5-turbo will be routed across multiple instances of azure/gpt-3.5-turbo
@ -138,6 +138,27 @@ print(response)
</Tabs>
### Test - Loadbalancing
In this request, the following will occur:
1. A rate limit exception will be raised
2. LiteLLM proxy will retry the request on the model group (default is 3).
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "Hi there!"}
],
"mock_testing_rate_limit_error": true
}'
```
[**See Code**](https://github.com/BerriAI/litellm/blob/6b8806b45f970cb2446654d2c379f8dcaa93ce3c/litellm/router.py#L2535)
### Test - Client Side Fallbacks
In this request the following will occur:
1. The request to `model="zephyr-beta"` will fail

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 👥📊 Team Based Logging
# 👥📊 [BETA] Team Based Logging
Allow each team to use their own Langfuse Project / custom callbacks
@ -11,7 +11,14 @@ Allow each team to use their own Langfuse Project / custom callbacks
Team 1 -> Logs to Langfuse Project 1
Team 2 -> Logs to Langfuse Project 2
Team 3 -> Disabled Logging (for GDPR compliance)
```
:::info
✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
:::
## Set Callbacks Per Team

View file

@ -1,7 +1,43 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Use with Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl
# 💡 Migrating from OpenAI (Langchain, OpenAI SDK, LlamaIndex, Instructor, Curl)
LiteLLM Proxy is **OpenAI-Compatible**, and supports:
* /chat/completions
* /embeddings
* /completions
* /image/generations
* /moderations
* /audio/transcriptions
* /audio/speech
* [Assistants API endpoints](https://docs.litellm.ai/docs/assistants)
* [Batches API endpoints](https://docs.litellm.ai/docs/batches)
* [Fine-Tuning API endpoints](https://docs.litellm.ai/docs/fine_tuning)
LiteLLM Proxy is **Azure OpenAI-compatible**:
* /chat/completions
* /completions
* /embeddings
LiteLLM Proxy is **Anthropic-compatible**:
* /messages
LiteLLM Proxy is **Vertex AI compatible**:
- [Supports ALL Vertex Endpoints](../vertex_ai)
This doc covers:
* /chat/completion
* /embedding
These are **selected examples**. LiteLLM Proxy is **OpenAI-Compatible**, it works with any project that calls OpenAI. Just change the `base_url`, `api_key` and `model`.
To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)
To drop unsupported params (E.g. frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)
:::info
@ -234,6 +270,54 @@ main();
```
</TabItem>
<TabItem value="anthropic-py" label="Anthropic Python SDK">
```python
import os
from anthropic import Anthropic
client = Anthropic(
base_url="http://localhost:4000", # proxy endpoint
api_key="sk-s4xN1IiLTCytwtZFJaYQrA", # litellm proxy virtual key
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
```
</TabItem>
<TabItem value="mistral-py" label="Mistral Python SDK">
```python
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
client = MistralClient(api_key="sk-1234", endpoint="http://0.0.0.0:4000")
chat_response = client.chat(
model="mistral-small-latest",
messages=[
{"role": "user", "content": "this is a test request, write a short poem"}
],
)
print(chat_response.choices[0].message.content)
```
</TabItem>
<TabItem value="instructor" label="Instructor">
```python
@ -566,6 +650,166 @@ curl --location 'http://0.0.0.0:4000/moderations' \
```
## Using with OpenAI compatible projects
Set `base_url` to the LiteLLM Proxy server
<Tabs>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="librechat" label="LibreChat">
#### Start the LiteLLM proxy
```shell
litellm --model gpt-3.5-turbo
#INFO: Proxy running on http://0.0.0.0:4000
```
#### 1. Clone the repo
```shell
git clone https://github.com/danny-avila/LibreChat.git
```
#### 2. Modify Librechat's `docker-compose.yml`
LiteLLM Proxy is running on port `4000`, set `4000` as the proxy below
```yaml
OPENAI_REVERSE_PROXY=http://host.docker.internal:4000/v1/chat/completions
```
#### 3. Save fake OpenAI key in Librechat's `.env`
Copy Librechat's `.env.example` to `.env` and overwrite the default OPENAI_API_KEY (by default it requires the user to pass a key).
```env
OPENAI_API_KEY=sk-1234
```
#### 4. Run LibreChat:
```shell
docker compose up
```
</TabItem>
<TabItem value="continue-dev" label="ContinueDev">
Continue-Dev brings ChatGPT to VSCode. See how to [install it here](https://continue.dev/docs/quickstart).
In the [config.py](https://continue.dev/docs/reference/Models/openai) set this as your default model.
```python
default=OpenAI(
api_key="IGNORED",
model="fake-model-name",
context_length=2048, # customize if needed for your model
api_base="http://localhost:4000" # your proxy server url
),
```
Credits [@vividfog](https://github.com/ollama/ollama/issues/305#issuecomment-1751848077) for this tutorial.
</TabItem>
<TabItem value="aider" label="Aider">
```shell
$ pip install aider
$ aider --openai-api-base http://0.0.0.0:4000 --openai-api-key fake-key
```
</TabItem>
<TabItem value="autogen" label="AutoGen">
```shell
pip install pyautogen
```
```python
from autogen import AssistantAgent, UserProxyAgent, oai
config_list=[
{
"model": "my-fake-model",
"api_base": "http://localhost:4000", #litellm compatible endpoint
"api_type": "open_ai",
"api_key": "NULL", # just a placeholder
}
]
response = oai.Completion.create(config_list=config_list, prompt="Hi")
print(response) # works fine
llm_config={
"config_list": config_list,
}
assistant = AssistantAgent("assistant", llm_config=llm_config)
user_proxy = UserProxyAgent("user_proxy")
user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stock price change YTD.", config_list=config_list)
```
Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
</TabItem>
<TabItem value="guidance" label="guidance">
A guidance language for controlling large language models.
https://github.com/guidance-ai/guidance
**NOTE:** Guidance sends additional params like `stop_sequences` which can cause some models to fail if they don't support it.
**Fix**: Start your proxy using the `--drop_params` flag
```shell
litellm --model ollama/codellama --temperature 0.3 --max_tokens 2048 --drop_params
```
```python
import guidance
# set api_base to your proxy
# set api_key to anything
gpt4 = guidance.llms.OpenAI("gpt-4", api_base="http://0.0.0.0:4000", api_key="anything")
experts = guidance('''
{{#system~}}
You are a helpful and terse assistant.
{{~/system}}
{{#user~}}
I want a response to the following question:
{{query}}
Name 3 world-class experts (past or present) who would be great at answering this?
Don't answer the question yet.
{{~/user}}
{{#assistant~}}
{{gen 'expert_names' temperature=0 max_tokens=300}}
{{~/assistant}}
''', llm=gpt4)
result = experts(query='How can I be more productive?')
print(result)
```
</TabItem>
</Tabs>
## Advanced
### (BETA) Batch Completions - pass multiple models

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [OLD PROXY 👉 [NEW proxy here](./simple_proxy)] Local OpenAI Proxy Server
# [OLD PROXY 👉 [NEW proxy here](./simple_proxy)] Local LiteLLM Proxy Server
A fast, and lightweight OpenAI-compatible server to call 100+ LLM APIs.

View file

@ -14,7 +14,7 @@ In production, litellm supports using Redis as a way to track cooldown server an
:::info
If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./proxy/load_balancing.md)
If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./proxy/load_balancing.md)
:::
@ -1637,7 +1637,7 @@ response = router.completion(
## Deploy Router
If you want a server to load balance across different LLM APIs, use our [OpenAI Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
If you want a server to load balance across different LLM APIs, use our [LiteLLM Proxy Server](./simple_proxy#load-balancing---multiple-instances-of-1-model)
## Init Params for the litellm.Router

View file

@ -147,6 +147,9 @@ model_list:
mock_response: "hello world!"
api_key: my-good-key
litellm_settings:
request_timeout: 600 # 👈 Will keep retrying until timeout occurs
router_settings:
redis_host: os.environ/REDIS_HOST
redis_password: os.environ/REDIS_PASSWORD

View file

@ -0,0 +1,65 @@
# Custom Pricing - SageMaker, Azure, etc
Register custom pricing for a sagemaker completion model.
For cost per second pricing, you **just** need to register `input_cost_per_second`.
```python
# !pip install boto3
import os

from litellm import completion, completion_cost
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
def test_completion_sagemaker():
try:
print("testing sagemaker")
response = completion(
model="sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
input_cost_per_second=0.000420,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
```
## Cost Per Token (e.g. Azure)
```python
import os

from litellm import completion, completion_cost
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
def test_completion_azure_model():
try:
print("testing azure custom pricing")
# azure call
response = completion(
model = "azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}],
input_cost_per_token=0.005,
output_cost_per_token=1,
)
# Add any assertions here to check the response
print(response)
cost = completion_cost(completion_response=response)
print(cost)
except Exception as e:
raise Exception(f"Error occurred: {e}")
test_completion_azure_model()
```
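On the proxy, the same pricing keys can generally be registered per model in `config.yaml` under `litellm_params`. A sketch — model names and cost values below are placeholders, adjust to your deployment:
```yaml
model_list:
  - model_name: azure-model
    litellm_params:
      model: azure/<your_deployment_name>
      api_base: os.environ/AZURE_API_BASE
      api_key: os.environ/AZURE_API_KEY
      api_version: os.environ/AZURE_API_VERSION
      input_cost_per_token: 0.000042   # 👈 placeholder - your negotiated rate
      output_cost_per_token: 0.000084  # 👈 placeholder
  - model_name: sagemaker-completion-model
    litellm_params:
      model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4
      input_cost_per_second: 0.000420  # 👈 cost-per-second pricing
```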

View file

@ -61,7 +61,7 @@ litellm --config /path/to/config.yaml
```
## Azure Key Vault
<!--
### Quick Start
```python
@ -88,9 +88,9 @@ import litellm
litellm.secret_manager = client
litellm.get_secret("your-test-key")
```
``` -->
### Usage with OpenAI Proxy Server
### Usage with LiteLLM Proxy Server
1. Install Proxy dependencies
```bash
@ -129,7 +129,7 @@ litellm --config /path/to/config.yaml
Use encrypted keys from Google KMS on the proxy
### Usage with OpenAI Proxy Server
### Usage with LiteLLM Proxy Server
## Step 1. Add keys to env
```
@ -160,29 +160,6 @@ $ litellm --test
[Quick Test Proxy](./proxy/quick_start#using-litellm-proxy---curl-request-openai-package-langchain-langchain-js)
## Infisical Secret Manager
Integrates with [Infisical's Secret Manager](https://infisical.com/) for secure storage and retrieval of API keys and sensitive data.
### Usage
liteLLM manages reading in your LLM API secrets/env variables from Infisical for you
```python
import litellm
from infisical import InfisicalClient
litellm.secret_manager = InfisicalClient(token="your-token")
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What's the weather like today?"},
]
response = litellm.completion(model="gpt-3.5-turbo", messages=messages)
print(response)
```
<!--
## .env Files
If no secret manager client is specified, Litellm automatically uses the `.env` file to manage sensitive data.
If no secret manager client is specified, Litellm automatically uses the `.env` file to manage sensitive data. -->

View file

@ -2,7 +2,7 @@ import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💥 OpenAI Proxy Server
# 💥 LiteLLM Proxy Server
LiteLLM Server manages:

View file

@ -0,0 +1,93 @@
# [BETA] Vertex AI Endpoints
## Supported API Endpoints
- Gemini API
- Embeddings API
- Imagen API
- Code Completion API
- Batch prediction API
- Tuning API
- CountTokens API
## Quick Start Usage
#### 1. Set `default_vertex_config` on your `config.yaml`
Add the following credentials to your litellm config.yaml to use the Vertex AI endpoints.
```yaml
default_vertex_config:
vertex_project: "adroit-crow-413218"
vertex_location: "us-central1"
vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json
```
#### 2. Start litellm proxy
```shell
litellm --config /path/to/config.yaml
```
#### 3. Test it
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:countTokens \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
```
## Usage Examples
### Gemini API (Generate Content)
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:generateContent \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
```
### Embeddings API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:predict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"content": "gm"}]}'
```
### Imagen API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/imagen-3.0-generate-001:predict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"instances":[{"prompt": "make an otter"}], "parameters": {"sampleCount": 1}}'
```
### Count Tokens API
```shell
curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:countTokens \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}'
```
### Tuning API
Create Fine Tuning Job
```shell
curl http://localhost:4000/vertex-ai/tuningJobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"baseModel": "gemini-1.0-pro-002",
"supervisedTuningSpec" : {
"training_dataset_uri": "gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl"
}
}'
```

View file

@ -28,6 +28,24 @@ const config = {
},
plugins: [
[
require.resolve("@getcanary/docusaurus-pagefind"),
{
indexOnly: true,
styles: {
"--canary-color-primary-c": 0.1,
"--canary-color-primary-h": 270,
},
pagefind: {
ranking: {
pageLength: 0.9,
termFrequency: 1.0,
termSimilarity: 1.0,
termSaturation: 1.5,
}
}
},
],
[
'@docusaurus/plugin-ideal-image',
{
@ -117,6 +135,11 @@ const config = {
label: '🚀 Hosted',
to: "docs/hosted"
},
{
href: 'https://models.litellm.ai/',
label: '💸 LLM Model Cost Map',
position: 'right',
},
{
href: 'https://github.com/BerriAI/litellm',
label: 'GitHub',

Binary file not shown.

After

Width:  |  Height:  |  Size: 301 KiB

File diff suppressed because it is too large Load diff

View file

@ -18,13 +18,14 @@
"@docusaurus/plugin-google-gtag": "^2.4.1",
"@docusaurus/plugin-ideal-image": "^2.4.1",
"@docusaurus/preset-classic": "2.4.1",
"@getcanary/docusaurus-pagefind": "^0.0.12",
"@getcanary/web": "^0.0.55",
"@mdx-js/react": "^1.6.22",
"clsx": "^1.2.1",
"docusaurus": "^1.14.7",
"docusaurus-lunr-search": "^2.4.1",
"prism-react-renderer": "^1.3.5",
"react": "^18.1.0",
"react-dom": "^18.1.0",
"react": "^17.0.2",
"react-dom": "^17.0.2",
"sharp": "^0.32.6",
"uuid": "^9.0.1"
},

View file

@ -20,11 +20,11 @@ const sidebars = {
{ type: "doc", id: "index" }, // NEW
{
type: "category",
label: "💥 OpenAI Proxy Server",
label: "💥 LiteLLM Proxy Server",
link: {
type: "generated-index",
title: "💥 OpenAI Proxy Server",
description: `Proxy Server to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
title: "💥 LiteLLM Proxy Server",
description: `OpenAI Proxy Server to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`,
slug: "/simple_proxy",
},
items: [
@ -42,12 +42,21 @@ const sidebars = {
"proxy/configs",
"proxy/reliability",
"proxy/cost_tracking",
"proxy/custom_pricing",
"proxy/self_serve",
"proxy/virtual_keys",
{
type: "category",
label: "🪢 Logging",
items: ["proxy/logging", "proxy/streaming_logging"],
items: ["proxy/logging", "proxy/bucket", "proxy/streaming_logging"],
},
{
type: "category",
label: "Secret Manager - storing LLM API Keys",
items: [
"secret",
"oidc"
]
},
"proxy/team_logging",
"proxy/guardrails",
@ -83,49 +92,7 @@ const sidebars = {
},
{
type: "category",
label: "Completion()",
link: {
type: "generated-index",
title: "Completion()",
description: "Details on the completion() function",
slug: "/completion",
},
items: [
"completion/input",
"completion/provider_specific_params",
"completion/json_mode",
"completion/drop_params",
"completion/prompt_formatting",
"completion/output",
"exception_mapping",
"completion/stream",
"completion/message_trimming",
"completion/function_call",
"completion/vision",
"completion/model_alias",
"completion/batching",
"completion/mock_requests",
"completion/reliable_completions",
],
},
{
type: "category",
label: "Embedding(), Image Generation(), Assistants(), Moderation(), Audio Transcriptions(), TTS(), Batches()",
items: [
"embedding/supported_embedding",
"embedding/async_embedding",
"embedding/moderation",
"image_generation",
"audio_transcription",
"text_to_speech",
"assistants",
"batches",
"anthropic_completion"
],
},
{
type: "category",
label: "Supported Models & Providers",
label: "💯 Supported Models & Providers",
link: {
type: "generated-index",
title: "Providers",
@ -160,6 +127,7 @@ const sidebars = {
"providers/perplexity",
"providers/friendliai",
"providers/groq",
"providers/github",
"providers/deepseek",
"providers/fireworks_ai",
"providers/clarifai",
@ -181,20 +149,68 @@ const sidebars = {
],
},
"proxy/custom_pricing",
"routing",
"scheduler",
"set_keys",
"budget_manager",
{
type: "category",
label: "Secret Manager",
label: "Chat Completions (litellm.completion)",
link: {
type: "generated-index",
title: "Chat Completions",
description: "Details on the completion() function",
slug: "/completion",
},
items: [
"secret",
"oidc"
]
"completion/input",
"completion/provider_specific_params",
"completion/json_mode",
"completion/drop_params",
"completion/prompt_formatting",
"completion/output",
"exception_mapping",
"completion/stream",
"completion/message_trimming",
"completion/function_call",
"completion/vision",
"completion/model_alias",
"completion/batching",
"completion/mock_requests",
"completion/reliable_completions",
],
},
{
type: "category",
label: "Supported Endpoints - /images, /audio/speech, /assistants etc",
items: [
"embedding/supported_embedding",
"embedding/async_embedding",
"embedding/moderation",
"image_generation",
"audio_transcription",
"text_to_speech",
"assistants",
"batches",
"fine_tuning",
"anthropic_completion",
"vertex_ai"
],
},
{
type: "category",
label: "🚅 LiteLLM Python SDK",
items: [
"routing",
"scheduler",
"set_keys",
"completion/token_usage",
"sdk_custom_pricing",
"budget_manager",
"caching/all_caches",
{
type: "category",
label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"],
},
],
},
"completion/token_usage",
"load_test",
{
type: "category",
@ -202,6 +218,7 @@ const sidebars = {
items: [
"observability/langfuse_integration",
"observability/logfire_integration",
"observability/gcs_bucket_integration",
"observability/langsmith_integration",
"observability/arize_integration",
"debugging/local_debugging",
@ -224,14 +241,12 @@ const sidebars = {
`observability/telemetry`,
],
},
"caching/all_caches",
{
type: "category",
label: "Tutorials",
items: [
'tutorials/azure_openai',
'tutorials/instructor',
'tutorials/oobabooga',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",
@ -243,11 +258,6 @@ const sidebars = {
"tutorials/model_fallbacks",
],
},
{
type: "category",
label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"],
},
{
type: "category",
label: "Extras",

View file

@ -10,7 +10,7 @@ https://github.com/BerriAI/litellm
- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Track spend & set budgets per project [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
- Track spend & set budgets per project [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
## Basic usage

View file

@ -31,3 +31,47 @@ response = asyncio.run(test_get_response())
print(response)
```
## Streaming Token Usage
Supported across all providers. Works the same as openai.
`stream_options={"include_usage": True}`
If set, an additional chunk will be streamed before the `data: [DONE]` message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
### SDK
```python
from litellm import completion
import os
os.environ["OPENAI_API_KEY"] = ""
messages = [{"role": "user", "content": "Hey, how's it going?"}]

response = completion(model="gpt-3.5-turbo", messages=messages, stream=True, stream_options={"include_usage": True})
for chunk in response:
print(chunk['choices'][0]['delta'])
```
### PROXY
```bash
curl http://0.0.0.0:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $OPENAI_API_KEY" \
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Hello!"
}
],
"stream": true,
"stream_options": {"include_usage": true}
}'
```
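With `"stream_options": {"include_usage": true}`, the final chunk before `data: [DONE]` carries the aggregate token counts and an empty `choices` array, roughly of this shape (illustrative values):
```shell
data: {"id":"chatcmpl-123","object":"chat.completion.chunk","model":"gpt-4o","choices":[],"usage":{"prompt_tokens":20,"completion_tokens":15,"total_tokens":35}}

data: [DONE]
```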

View file

@ -0,0 +1,95 @@
import React from "react";
import SearchBar from "@theme-original/SearchBar";
import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
import { usePluginData } from "@docusaurus/useGlobalData";
export default function SearchBarWrapper(props) {
const { siteConfig } = useDocusaurusContext();
const { options } = usePluginData("docusaurus-plugin-pagefind-canary");
const [path, setPath] = React.useState("");
const [loaded, setLoaded] = React.useState(false);
React.useEffect(() => {
setPath(`${siteConfig.baseUrl}pagefind/pagefind.js`);
}, [siteConfig]);
React.useEffect(() => {
Promise.all([
import("@getcanary/web/components/canary-root"),
import("@getcanary/web/components/canary-provider-pagefind"),
import("@getcanary/web/components/canary-modal"),
import("@getcanary/web/components/canary-trigger-logo"),
import("@getcanary/web/components/canary-content"),
import("@getcanary/web/components/canary-search"),
import("@getcanary/web/components/canary-search-input"),
import("@getcanary/web/components/canary-search-results-group"),
import("@getcanary/web/components/canary-footer"),
import("@getcanary/web/components/canary-callout-calendly"),
import("@getcanary/web/components/canary-callout-discord"),
])
.then(() => setLoaded(true))
.catch(console.error);
}, []);
return (
<div
style={{
display: "flex",
flexDirection: "row",
alignItems: "center",
gap: "6px",
}}
>
{!loaded || !path ? (
<button
style={{
fontSize: "2rem",
backgroundColor: "transparent",
border: "none",
outline: "none",
padding: "0",
marginRight: "6px",
}}
>
🐤
</button>
) : (
<canary-root framework="docusaurus">
<canary-provider-pagefind
options={JSON.stringify({ ...options, path })}
>
<canary-modal>
<canary-trigger-logo slot="trigger"></canary-trigger-logo>
<canary-content slot="content">
<canary-search slot="search">
<canary-search-input slot="input"></canary-search-input>
<canary-search-results-group
slot="results"
groups="SDK:*;Proxy:/docs/(simple_proxy|proxy/.*)"
></canary-search-results-group>
<canary-callout-discord
slot="callout"
message="👋 Looking for help?"
url="https://discord.com/invite/wuPM9dRgDw"
keywords="discord,help,support,community"
></canary-callout-discord>
<canary-callout-calendly
slot="callout"
message="🚅 Interested in enterprise features?"
keywords="sso,enterprise,security,audit"
url="https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"
></canary-callout-calendly>
</canary-search>
<canary-footer slot="footer"></canary-footer>
</canary-content>
</canary-modal>
</canary-provider-pagefind>
</canary-root>
)}
<SearchBar {...props} />
</div>
);
}

File diff suppressed because it is too large Load diff

View file

@ -138,11 +138,24 @@ class _ENTERPRISE_lakeraAI_Moderation(CustomLogger):
return
text = ""
if "messages" in data and isinstance(data["messages"], list):
enabled_roles = litellm.guardrail_name_config_map[
"prompt_injection"
].enabled_roles
prompt_injection_obj: Optional[GuardrailItem] = (
litellm.guardrail_name_config_map.get("prompt_injection")
)
if prompt_injection_obj is not None:
enabled_roles = prompt_injection_obj.enabled_roles
else:
enabled_roles = None
if enabled_roles is None:
enabled_roles = default_roles
stringified_roles: List[str] = []
if enabled_roles is not None: # convert to list of str
for role in enabled_roles:
if isinstance(role, Role):
stringified_roles.append(role.value)
elif isinstance(role, str):
stringified_roles.append(role)
lakera_input_dict: Dict = {
role: None for role in INPUT_POSITIONING_MAP.keys()
}
@ -150,7 +163,7 @@ class _ENTERPRISE_lakeraAI_Moderation(CustomLogger):
tool_call_messages: List = []
for message in data["messages"]:
role = message.get("role")
if role in enabled_roles:
if role in stringified_roles:
if "tool_calls" in message:
tool_call_messages = [
*tool_call_messages,

View file

@ -2,8 +2,8 @@ apiVersion: v1
entries:
litellm-helm:
- apiVersion: v2
appVersion: v1.41.8
created: "2024-07-10T00:59:11.1889+08:00"
appVersion: v1.42.7
created: "2024-08-01T12:25:58.808699+08:00"
dependencies:
- condition: db.deployStandalone
name: postgresql
@ -14,31 +14,12 @@ entries:
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=18.0.0'
description: Call all LLM APIs using the OpenAI format
digest: eeff5e4e6cebb4c977cb7359c1ec6c773c66982f6aa39dbed94a674890144a43
digest: b1de8fa444a37410e223a3d1bd3cc2120f3f22204005fcb61e701c0c7db95d86
name: litellm-helm
type: application
urls:
- https://berriai.github.io/litellm/litellm-helm-0.2.1.tgz
version: 0.2.1
- apiVersion: v2
appVersion: v1.35.38
created: "2024-05-06T10:22:24.384392-07:00"
dependencies:
- condition: db.deployStandalone
name: postgresql
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=13.3.0'
- condition: redis.enabled
name: redis
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=18.0.0'
description: Call all LLM APIs using the OpenAI format
digest: 60f0cfe9e7c1087437cb35f6fb7c43c3ab2be557b6d3aec8295381eb0dfa760f
name: litellm-helm
type: application
urls:
- litellm-helm-0.2.0.tgz
version: 0.2.0
- https://berriai.github.io/litellm/litellm-helm-0.2.2.tgz
version: 0.2.2
postgresql:
- annotations:
category: Database
@ -52,7 +33,7 @@ entries:
licenses: Apache-2.0
apiVersion: v2
appVersion: 16.2.0
created: "2024-07-10T00:59:11.191731+08:00"
created: "2024-08-01T12:25:58.812033+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
@ -98,7 +79,7 @@ entries:
licenses: Apache-2.0
apiVersion: v2
appVersion: 7.2.4
created: "2024-07-10T00:59:11.195667+08:00"
created: "2024-08-01T12:25:58.816784+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
@ -124,4 +105,4 @@ entries:
urls:
- https://berriai.github.io/litellm/charts/redis-18.19.1.tgz
version: 18.19.1
generated: "2024-07-10T00:59:11.179952+08:00"
generated: "2024-08-01T12:25:58.800261+08:00"

BIN
litellm-helm-0.2.2.tgz Normal file

Binary file not shown.

View file

@ -46,6 +46,7 @@ _custom_logger_compatible_callbacks_literal = Literal[
"galileo",
"braintrust",
"arize",
"gcs_bucket",
]
_known_custom_logger_compatible_callbacks: List = list(
get_args(_custom_logger_compatible_callbacks_literal)
@ -145,6 +146,9 @@ return_response_headers: bool = (
)
##################
logging: bool = True
enable_caching_on_provider_specific_optional_params: bool = (
False # feature-flag for caching on optional params - e.g. 'top_k'
)
caching: bool = (
False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
)
@ -165,6 +169,7 @@ budget_duration: Optional[str] = (
default_soft_budget: float = (
50.0 # by default all litellm proxy keys have a soft budget of 50.0
)
forward_traceparent_to_llm_provider: bool = False
_openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
_openai_completion_params = [
"functions",
@ -266,7 +271,7 @@ default_fallbacks: Optional[List] = None
fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None
content_policy_fallbacks: Optional[List] = None
allowed_fails: int = 0
allowed_fails: int = 3
num_retries_per_request: Optional[int] = (
None # for the request overall (incl. fallbacks + model retries)
)
@ -358,6 +363,7 @@ vertex_code_text_models: List = []
vertex_embedding_models: List = []
vertex_anthropic_models: List = []
vertex_llama3_models: List = []
vertex_mistral_models: List = []
ai21_models: List = []
nlp_cloud_models: List = []
aleph_alpha_models: List = []
@ -403,6 +409,9 @@ for key, value in model_cost.items():
elif value.get("litellm_provider") == "vertex_ai-llama_models":
key = key.replace("vertex_ai/", "")
vertex_llama3_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-mistral_models":
key = key.replace("vertex_ai/", "")
vertex_mistral_models.append(key)
elif value.get("litellm_provider") == "ai21":
ai21_models.append(key)
elif value.get("litellm_provider") == "nlp_cloud":
@ -452,6 +461,7 @@ openai_compatible_providers: List = [
"empower",
"friendliai",
"azure_ai",
"github",
]
@ -692,6 +702,7 @@ provider_list: List = [
"predibase",
"databricks",
"empower",
"github",
"custom", # custom apis
]
@ -809,9 +820,19 @@ from .utils import (
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
TextCompletionResponse,
get_provider_fields,
)
ALL_LITELLM_RESPONSE_TYPES = [
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
TextCompletionResponse,
]
from .types.utils import ImageObject
from .llms.custom_llm import CustomLLM
from .llms.huggingface_restapi import HuggingfaceConfig
@ -833,7 +854,7 @@ from .llms.petals import PetalsConfig
from .llms.vertex_httpx import VertexGeminiConfig, GoogleAIStudioGeminiConfig
from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
from .llms.vertex_ai_llama import VertexAILlama3Config
from .llms.vertex_ai_partner import VertexAILlama3Config
from .llms.sagemaker import SagemakerConfig
from .llms.ollama import OllamaConfig
from .llms.ollama_chat import OllamaChatConfig
@ -902,6 +923,7 @@ from .proxy.proxy_cli import run_server
from .router import Router
from .assistants.main import *
from .batches.main import *
from .fine_tuning.main import *
from .files.main import *
from .scheduler import *
from .cost_calculator import response_cost_calculator, cost_per_token

View file

@ -56,6 +56,7 @@ class ServiceLogging(CustomLogger):
parent_otel_span: Optional[Span] = None,
start_time: Optional[Union[datetime, float]] = None,
end_time: Optional[Union[datetime, float]] = None,
event_metadata: Optional[dict] = None,
):
"""
- For counting if the redis, postgres call is successful
@ -84,6 +85,7 @@ class ServiceLogging(CustomLogger):
parent_otel_span=parent_otel_span,
start_time=start_time,
end_time=end_time,
event_metadata=event_metadata,
)
async def async_service_failure_hook(
@ -95,6 +97,7 @@ class ServiceLogging(CustomLogger):
parent_otel_span: Optional[Span] = None,
start_time: Optional[Union[datetime, float]] = None,
end_time: Optional[Union[float, datetime]] = None,
event_metadata: Optional[dict] = None,
):
"""
- For counting if the redis, postgres call is unsuccessful
@ -125,12 +128,16 @@ class ServiceLogging(CustomLogger):
from litellm.proxy.proxy_server import open_telemetry_logger
if parent_otel_span is not None and open_telemetry_logger is not None:
if not isinstance(error, str):
error = str(error)
if open_telemetry_logger is not None:
await open_telemetry_logger.async_service_failure_hook(
payload=payload,
parent_otel_span=parent_otel_span,
start_time=start_time,
end_time=end_time,
event_metadata=event_metadata,
error=error,
)
async def async_post_call_failure_hook(

View file

@ -4,7 +4,7 @@ import json
import os
import traceback
import uuid
from typing import Literal, Optional
from typing import Any, Literal, Optional
import dotenv
import httpx
@ -13,7 +13,12 @@ from pydantic import BaseModel
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
from litellm.types.llms.anthropic import (
AnthropicMessagesRequest,
AnthropicResponse,
ContentBlockDelta,
)
from litellm.types.utils import AdapterCompletionStreamWrapper
class AnthropicAdapter(CustomLogger):
@ -43,8 +48,150 @@ class AnthropicAdapter(CustomLogger):
response=response
)
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
return super().translate_completion_output_params_streaming()
def translate_completion_output_params_streaming(
self, completion_stream: Any
) -> AdapterCompletionStreamWrapper | None:
return AnthropicStreamWrapper(completion_stream=completion_stream)
anthropic_adapter = AnthropicAdapter()
class AnthropicStreamWrapper(AdapterCompletionStreamWrapper):
"""
- first chunk return 'message_start'
- content block must be started and stopped
- finish_reason must map exactly to anthropic reason, else anthropic client won't be able to parse it.
"""
sent_first_chunk: bool = False
sent_content_block_start: bool = False
sent_content_block_finish: bool = False
sent_last_message: bool = False
holding_chunk: Optional[Any] = None
def __next__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except Exception as e:
verbose_logger.error(
"Anthropic Adapter - {}\n{}".format(e, traceback.format_exc())
)
async def __anext__(self):
try:
if self.sent_first_chunk is False:
self.sent_first_chunk = True
return {
"type": "message_start",
"message": {
"id": "msg_1nZdL29xx5MUA1yADyHTEsnR8uuvGzszyY",
"type": "message",
"role": "assistant",
"content": [],
"model": "claude-3-5-sonnet-20240620",
"stop_reason": None,
"stop_sequence": None,
"usage": {"input_tokens": 25, "output_tokens": 1},
},
}
if self.sent_content_block_start is False:
self.sent_content_block_start = True
return {
"type": "content_block_start",
"index": 0,
"content_block": {"type": "text", "text": ""},
}
async for chunk in self.completion_stream:
if chunk == "None" or chunk is None:
raise Exception
processed_chunk = litellm.AnthropicConfig().translate_streaming_openai_response_to_anthropic(
response=chunk
)
if (
processed_chunk["type"] == "message_delta"
and self.sent_content_block_finish is False
):
self.holding_chunk = processed_chunk
self.sent_content_block_finish = True
return {
"type": "content_block_stop",
"index": 0,
}
elif self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = processed_chunk
return return_chunk
else:
return processed_chunk
if self.holding_chunk is not None:
return_chunk = self.holding_chunk
self.holding_chunk = None
return return_chunk
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopIteration
except StopIteration:
if self.sent_last_message is False:
self.sent_last_message = True
return {"type": "message_stop"}
raise StopAsyncIteration

View file

@ -20,10 +20,8 @@ import httpx
import litellm
from litellm import client
from litellm.utils import supports_httpx_timeout
from ..llms.openai import OpenAIBatchesAPI, OpenAIFilesAPI
from ..types.llms.openai import (
from litellm.llms.openai import OpenAIBatchesAPI, OpenAIFilesAPI
from litellm.types.llms.openai import (
Batch,
CancelBatchRequest,
CreateBatchRequest,
@ -34,7 +32,8 @@ from ..types.llms.openai import (
HttpxBinaryResponseContent,
RetrieveBatchRequest,
)
from ..types.router import *
from litellm.types.router import GenericLiteLLMParams
from litellm.utils import supports_httpx_timeout
####### ENVIRONMENT VARIABLES ###################
openai_batches_instance = OpenAIBatchesAPI()
@ -314,17 +313,135 @@ def retrieve_batch(
raise e
def cancel_batch():
async def alist_batches(
after: Optional[str] = None,
limit: Optional[int] = None,
custom_llm_provider: Literal["openai"] = "openai",
metadata: Optional[Dict[str, str]] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Batch:
"""
Async: List your organization's batches.
"""
try:
loop = asyncio.get_event_loop()
kwargs["alist_batches"] = True
# Use a partial function to pass your keyword arguments
func = partial(
list_batches,
after,
limit,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def list_batches(
after: Optional[str] = None,
limit: Optional[int] = None,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
):
"""
Lists batches
List your organization's batches.
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("alist_batches", False) is True
response = openai_batches_instance.list_batches(
_is_async=_is_async,
after=after,
limit=limit,
api_base=api_base,
api_key=api_key,
organization=organization,
timeout=timeout,
max_retries=optional_params.max_retries,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
pass
def list_batch():
def cancel_batch():
pass
async def acancel_batch():
pass
async def alist_batch():
pass

View file

@ -10,6 +10,7 @@
import ast
import asyncio
import hashlib
import io
import json
import logging
import time
@ -21,7 +22,9 @@ from openai._models import BaseModel as OpenAIObject
import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
from litellm.types.utils import all_litellm_params
def print_verbose(print_statement):
@ -33,16 +36,6 @@ def print_verbose(print_statement):
pass
def _get_parent_otel_span_from_kwargs(kwargs: Optional[dict] = None):
try:
if kwargs is None:
return None
_metadata = kwargs.get("metadata") or {}
return _metadata.get("litellm_parent_otel_span")
except:
return None
class BaseCache:
def set_cache(self, key, value, **kwargs):
raise NotImplementedError
@ -1701,6 +1694,8 @@ class Cache:
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
]
]
] = [
@ -1710,6 +1705,8 @@ class Cache:
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
],
# s3 Bucket, boto3 configuration
s3_bucket_name: Optional[str] = None,
@ -1843,6 +1840,7 @@ class Cache:
"seed",
"tools",
"tool_choice",
"stream",
]
embedding_only_kwargs = [
"input",
@ -1856,9 +1854,9 @@ class Cache:
combined_kwargs = (
completion_kwargs + embedding_only_kwargs + transcription_only_kwargs
)
for param in combined_kwargs:
# ignore litellm params here
if param in kwargs:
litellm_param_kwargs = all_litellm_params
for param in kwargs:
if param in combined_kwargs:
# check if param == model and model_group is passed in, then override model with model_group
if param == "model":
model_group = None
@ -1888,21 +1886,33 @@ class Cache:
caching_group or model_group or kwargs[param]
) # use caching_group, if set then model_group if it exists, else use kwargs["model"]
elif param == "file":
metadata_file_name = kwargs.get("metadata", {}).get(
"file_name", None
file = kwargs.get("file")
metadata = kwargs.get("metadata", {})
litellm_params = kwargs.get("litellm_params", {})
# get checksum of file content
param_value = (
metadata.get("file_checksum")
or getattr(file, "name", None)
or metadata.get("file_name")
or litellm_params.get("file_name")
)
litellm_params_file_name = kwargs.get("litellm_params", {}).get(
"file_name", None
)
if metadata_file_name is not None:
param_value = metadata_file_name
elif litellm_params_file_name is not None:
param_value = litellm_params_file_name
else:
if kwargs[param] is None:
continue # ignore None params
param_value = kwargs[param]
cache_key += f"{str(param)}: {str(param_value)}"
elif (
param not in litellm_param_kwargs
): # check if user passed in optional param - e.g. top_k
if (
litellm.enable_caching_on_provider_specific_optional_params is True
): # feature flagged for now
if kwargs[param] is None:
continue # ignore None params
param_value = kwargs[param]
cache_key += f"{str(param)}: {str(param_value)}"
print_verbose(f"\nCreated cache key: {cache_key}")
# Use hashlib to create a sha256 hash of the cache key
hash_object = hashlib.sha256(cache_key.encode())
@ -2107,9 +2117,7 @@ class Cache:
try:
cache_list = []
for idx, i in enumerate(kwargs["input"]):
preset_cache_key = litellm.cache.get_cache_key(
*args, **{**kwargs, "input": i}
)
preset_cache_key = self.get_cache_key(*args, **{**kwargs, "input": i})
kwargs["cache_key"] = preset_cache_key
embedding_response = result.data[idx]
cache_key, cached_data, kwargs = self._add_cache_logic(
@ -2244,6 +2252,8 @@ def enable_cache(
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
]
]
] = [
@ -2253,6 +2263,8 @@ def enable_cache(
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
],
**kwargs,
):
@ -2309,6 +2321,8 @@ def update_cache(
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
]
]
] = [
@ -2318,6 +2332,8 @@ def update_cache(
"aembedding",
"atranscription",
"transcription",
"atext_completion",
"text_completion",
],
**kwargs,
):
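A short configuration sketch tying the new cache options together (the model name and top_k value are illustrative; get_cache_key is called directly, so no API key or network call is needed):

import litellm
from litellm.caching import Cache

# text_completion / atext_completion are now valid supported_call_types
litellm.cache = Cache(
    type="local",
    supported_call_types=["text_completion", "atext_completion"],
)

# Feature flag from the diff above: fold provider-specific optional params
# (e.g. top_k) into the cache key instead of ignoring them
litellm.enable_caching_on_provider_specific_optional_params = True

# Identical kwargs (including top_k) now map to the same cache key
key_a = litellm.cache.get_cache_key(model="command-r", prompt="hello", top_k=10)
key_b = litellm.cache.get_cache_key(model="command-r", prompt="hello", top_k=10)
assert key_a == key_b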

View file

@ -106,7 +106,6 @@ def cost_per_token(
Returns:
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
"""
args = locals()
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## CUSTOM PRICING ##
@ -117,6 +116,7 @@ def cost_per_token(
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
if response_cost is not None:
return response_cost[0], response_cost[1]
@ -495,9 +495,9 @@ def completion_cost(
completion_tokens = completion_response.get("usage", {}).get(
"completion_tokens", 0
)
total_time = completion_response.get("_response_ms", 0)
total_time = getattr(completion_response, "_response_ms", 0)
verbose_logger.debug(
f"completion_response response ms: {completion_response.get('_response_ms')} "
f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} "
)
model = model or completion_response.get(
"model", None
@ -509,7 +509,7 @@ def completion_cost(
):
model = completion_response._hidden_params.get("model", model)
custom_llm_provider = completion_response._hidden_params.get(
"custom_llm_provider", ""
"custom_llm_provider", custom_llm_provider or ""
)
region_name = completion_response._hidden_params.get(
"region_name", region_name
@ -659,9 +659,7 @@ def completion_cost(
call_type=call_type,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(
f"final cost: {_final_cost}; prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}; completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return _final_cost
except Exception as e:
raise e
@ -732,14 +730,21 @@ def response_cost_calculator(
)
return response_cost
except litellm.NotFoundError as e:
print_verbose(
verbose_logger.debug( # debug since it can be spammy in logs, for calls
f"Model={model} for LLM Provider={custom_llm_provider} not found in completion cost map."
)
return None
except Exception as e:
verbose_logger.warning(
"litellm.cost_calculator.py::response_cost_calculator - Returning None. Exception occurred - {}/n{}".format(
str(e), traceback.format_exc()
if litellm.suppress_debug_info: # allow cli tools to suppress this information.
verbose_logger.debug(
"litellm.cost_calculator.py::response_cost_calculator - Returning None. Exception occurred - {}/n{}".format(
str(e), traceback.format_exc()
)
)
else:
verbose_logger.warning(
"litellm.cost_calculator.py::response_cost_calculator - Returning None. Exception occurred - {}/n{}".format(
str(e), traceback.format_exc()
)
)
)
return None
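A small offline sketch of the cost helpers touched above (the token counts are made up; nothing is sent over the network):

import litellm
from litellm import cost_per_token

# Per-token pricing for a model in litellm's cost map;
# returns (prompt_cost_usd, completion_cost_usd)
prompt_usd, completion_usd = cost_per_token(
    model="gpt-3.5-turbo",
    prompt_tokens=1000,
    completion_tokens=200,
)
print(f"prompt: ${prompt_usd:.6f}, completion: ${completion_usd:.6f}")

# Silence the "model not found in cost map" warnings for CLI-style tools,
# per the suppress_debug_info branch added above
litellm.suppress_debug_info = True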

View file

@ -122,7 +122,7 @@ class BadRequestError(openai.BadRequestError): # type: ignore
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
response = response or httpx.Response(
response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
@ -199,8 +199,12 @@ class Timeout(openai.APITimeoutError): # type: ignore
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
headers: Optional[dict] = None,
):
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
request = httpx.Request(
method="POST",
url="https://api.openai.com/v1",
)
super().__init__(
request=request
) # Call the base class constructor with the parameters it needs
@ -211,6 +215,7 @@ class Timeout(openai.APITimeoutError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
self.headers = headers
# custom function to convert to str
def __str__(self):
@ -287,16 +292,13 @@ class RateLimitError(openai.RateLimitError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=429,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
self.response = httpx.Response(
status_code=429,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
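Since these mapped exceptions now always synthesize their own httpx.Response, callers can rely on .status_code and .response being populated. A minimal handling sketch (the model and message are placeholders and OPENAI_API_KEY is assumed to be set):

import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",  # placeholder
        messages=[{"role": "user", "content": "hi"}],
    )
except litellm.RateLimitError as e:
    # .response is always a synthesized 429 httpx.Response after this change
    print(e.status_code, e.llm_provider, e.response.status_code)
except litellm.APIConnectionError as e:
    # request now defaults to a synthesized httpx.Request when none is passed
    print(e.status_code, e.message)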
@ -334,7 +336,7 @@ class ContextWindowExceededError(BadRequestError): # type: ignore
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.response = response or httpx.Response(status_code=400, request=request)
self.response = httpx.Response(status_code=400, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
@ -377,7 +379,7 @@ class RejectedRequestError(BadRequestError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.request_data = request_data
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
response = httpx.Response(status_code=500, request=request)
response = httpx.Response(status_code=400, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
@ -419,7 +421,7 @@ class ContentPolicyViolationError(BadRequestError): # type: ignore
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.response = response or httpx.Response(status_code=500, request=request)
self.response = httpx.Response(status_code=400, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
@ -463,16 +465,13 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
@ -512,16 +511,13 @@ class InternalServerError(openai.InternalServerError): # type: ignore
self.litellm_debug_info = litellm_debug_info
self.max_retries = max_retries
self.num_retries = num_retries
if response is None:
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
else:
self.response = response
self.response = httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="POST",
url=" https://cloud.google.com/vertex-ai/",
),
)
super().__init__(
self.message, response=self.response, body=None
) # Call the base class constructor with the parameters it needs
@ -547,7 +543,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
class APIError(openai.APIError): # type: ignore
def __init__(
self,
status_code,
status_code: int,
message,
llm_provider,
model,
@ -591,7 +587,7 @@ class APIConnectionError(openai.APIConnectionError): # type: ignore
message,
llm_provider,
model,
request: httpx.Request,
request: Optional[httpx.Request] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
@ -601,9 +597,10 @@ class APIConnectionError(openai.APIConnectionError): # type: ignore
self.model = model
self.status_code = 500
self.litellm_debug_info = litellm_debug_info
self.request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.max_retries = max_retries
self.num_retries = num_retries
super().__init__(message=self.message, request=request)
super().__init__(message=self.message, request=self.request)
def __str__(self):
_message = self.message
@ -757,7 +754,7 @@ class MockException(openai.APIError):
# used for testing
def __init__(
self,
status_code,
status_code: int,
message,
llm_provider,
model,

View file

@ -14,7 +14,8 @@ from typing import Any, Coroutine, Dict, Literal, Optional, Union
import httpx
import litellm
from litellm import client
from litellm import client, get_secret
from litellm.llms.files_apis.azure import AzureOpenAIFilesAPI
from litellm.llms.openai import FileDeleted, FileObject, OpenAIFilesAPI
from litellm.types.llms.openai import (
Batch,
@ -28,12 +29,13 @@ from litellm.utils import supports_httpx_timeout
####### ENVIRONMENT VARIABLES ###################
openai_files_instance = OpenAIFilesAPI()
azure_files_instance = AzureOpenAIFilesAPI()
#################################################
async def afile_retrieve(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -73,7 +75,7 @@ async def afile_retrieve(
def file_retrieve(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -156,7 +158,7 @@ def file_retrieve(
# Delete file
async def afile_delete(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -196,7 +198,7 @@ async def afile_delete(
def file_delete(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -208,6 +210,22 @@ def file_delete(
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
@ -229,26 +247,6 @@ def file_delete(
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
response = openai_files_instance.delete_file(
file_id=file_id,
_is_async=_is_async,
@ -258,6 +256,38 @@ def file_delete(
max_retries=optional_params.max_retries,
organization=organization,
)
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
response = azure_files_instance.delete_file(
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
api_version=api_version,
timeout=timeout,
max_retries=optional_params.max_retries,
file_id=file_id,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
@ -278,7 +308,7 @@ def file_delete(
# List files
async def afile_list(
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
purpose: Optional[str] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
@ -318,7 +348,7 @@ async def afile_list(
def file_list(
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
purpose: Optional[str] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
@ -402,7 +432,7 @@ def file_list(
async def acreate_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -444,7 +474,7 @@ async def acreate_file(
def create_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -455,7 +485,31 @@ def create_file(
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
_is_async = kwargs.pop("acreate_file", False) is True
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_create_file_request = CreateFileRequest(
file=file,
purpose=purpose,
extra_headers=extra_headers,
extra_body=extra_body,
)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
@ -477,32 +531,6 @@ def create_file(
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_create_file_request = CreateFileRequest(
file=file,
purpose=purpose,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("acreate_file", False) is True
response = openai_files_instance.create_file(
_is_async=_is_async,
@ -513,6 +541,38 @@ def create_file(
organization=organization,
create_file_data=_create_file_request,
)
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
response = azure_files_instance.create_file(
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
api_version=api_version,
timeout=timeout,
max_retries=optional_params.max_retries,
create_file_data=_create_file_request,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
@ -533,7 +593,7 @@ def create_file(
async def afile_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
@ -573,7 +633,7 @@ async def afile_content(
def file_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
custom_llm_provider: Literal["openai", "azure"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,

593
litellm/fine_tuning/main.py Normal file
View file

@ -0,0 +1,593 @@
"""
Main File for Fine Tuning API implementation
https://platform.openai.com/docs/api-reference/fine-tuning
- fine_tuning.jobs.create()
- fine_tuning.jobs.list()
- client.fine_tuning.jobs.list_events()
"""
import asyncio
import contextvars
import os
from functools import partial
from typing import Any, Coroutine, Dict, Literal, Optional, Union
import httpx
import litellm
from litellm import get_secret
from litellm._logging import verbose_logger
from litellm.llms.fine_tuning_apis.azure import AzureOpenAIFineTuningAPI
from litellm.llms.fine_tuning_apis.openai import (
FineTuningJob,
FineTuningJobCreate,
OpenAIFineTuningAPI,
)
from litellm.llms.fine_tuning_apis.vertex_ai import VertexFineTuningAPI
from litellm.types.llms.openai import Hyperparameters
from litellm.types.router import *
from litellm.utils import supports_httpx_timeout
####### ENVIRONMENT VARIABLES ###################
openai_fine_tuning_apis_instance = OpenAIFineTuningAPI()
azure_fine_tuning_apis_instance = AzureOpenAIFineTuningAPI()
vertex_fine_tuning_apis_instance = VertexFineTuningAPI()
#################################################
async def acreate_fine_tuning_job(
model: str,
training_file: str,
hyperparameters: Optional[Hyperparameters] = {}, # type: ignore
suffix: Optional[str] = None,
validation_file: Optional[str] = None,
integrations: Optional[List[str]] = None,
seed: Optional[int] = None,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FineTuningJob:
"""
Async: Creates and executes a batch from an uploaded file of request
"""
verbose_logger.debug(
"inside acreate_fine_tuning_job model=%s and kwargs=%s", model, kwargs
)
try:
loop = asyncio.get_event_loop()
kwargs["acreate_fine_tuning_job"] = True
# Use a partial function to pass your keyword arguments
func = partial(
create_fine_tuning_job,
model,
training_file,
hyperparameters,
suffix,
validation_file,
integrations,
seed,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def create_fine_tuning_job(
model: str,
training_file: str,
hyperparameters: Optional[Hyperparameters] = {}, # type: ignore
suffix: Optional[str] = None,
validation_file: Optional[str] = None,
integrations: Optional[List[str]] = None,
seed: Optional[int] = None,
custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[FineTuningJob, Coroutine[Any, Any, FineTuningJob]]:
"""
Creates a fine-tuning job which begins the process of creating a new model from a given dataset.
Response includes details of the enqueued job including job status and the name of the fine-tuned models once complete
"""
try:
_is_async = kwargs.pop("acreate_fine_tuning_job", False) is True
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
create_fine_tuning_job_data = FineTuningJobCreate(
model=model,
training_file=training_file,
hyperparameters=hyperparameters,
suffix=suffix,
validation_file=validation_file,
integrations=integrations,
seed=seed,
)
create_fine_tuning_job_data_dict = create_fine_tuning_job_data.model_dump(
exclude_none=True
)
response = openai_fine_tuning_apis_instance.create_fine_tuning_job(
api_base=api_base,
api_key=api_key,
organization=organization,
create_fine_tuning_job_data=create_fine_tuning_job_data_dict,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
# Azure OpenAI
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
create_fine_tuning_job_data = FineTuningJobCreate(
model=model,
training_file=training_file,
hyperparameters=hyperparameters,
suffix=suffix,
validation_file=validation_file,
integrations=integrations,
seed=seed,
)
create_fine_tuning_job_data_dict = create_fine_tuning_job_data.model_dump(
exclude_none=True
)
response = azure_fine_tuning_apis_instance.create_fine_tuning_job(
api_base=api_base,
api_key=api_key,
api_version=api_version,
create_fine_tuning_job_data=create_fine_tuning_job_data_dict,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
elif custom_llm_provider == "vertex_ai":
api_base = optional_params.api_base or ""
vertex_ai_project = (
optional_params.vertex_project
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.vertex_location
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
vertex_credentials = optional_params.vertex_credentials or get_secret(
"VERTEXAI_CREDENTIALS"
)
create_fine_tuning_job_data = FineTuningJobCreate(
model=model,
training_file=training_file,
hyperparameters=hyperparameters,
suffix=suffix,
validation_file=validation_file,
integrations=integrations,
seed=seed,
)
response = vertex_fine_tuning_apis_instance.create_fine_tuning_job(
_is_async=_is_async,
create_fine_tuning_job_data=create_fine_tuning_job_data,
vertex_credentials=vertex_credentials,
vertex_project=vertex_ai_project,
vertex_location=vertex_ai_location,
timeout=timeout,
api_base=api_base,
kwargs=kwargs,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
verbose_logger.error("got exception in create_fine_tuning_job=%s", str(e))
raise e
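For reference, a usage sketch of the entrypoint above (the training file id is a placeholder from a prior create_file call, and OPENAI_API_KEY is assumed to be set):

from litellm.fine_tuning.main import create_fine_tuning_job

ft_job = create_fine_tuning_job(
    model="gpt-3.5-turbo-0125",
    training_file="file-abc123",  # placeholder id returned by litellm.create_file(...)
    custom_llm_provider="openai",
)
print(ft_job.id, ft_job.status)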
async def acancel_fine_tuning_job(
fine_tuning_job_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FineTuningJob:
"""
Async: Immediately cancel a fine-tune job.
"""
try:
loop = asyncio.get_event_loop()
kwargs["acancel_fine_tuning_job"] = True
# Use a partial function to pass your keyword arguments
func = partial(
cancel_fine_tuning_job,
fine_tuning_job_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def cancel_fine_tuning_job(
fine_tuning_job_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[FineTuningJob, Coroutine[Any, Any, FineTuningJob]]:
"""
Immediately cancel a fine-tune job.
Response includes details of the enqueued job including job status and the name of the fine-tuned models once complete
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("acancel_fine_tuning_job", False) is True
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_fine_tuning_apis_instance.cancel_fine_tuning_job(
api_base=api_base,
api_key=api_key,
organization=organization,
fine_tuning_job_id=fine_tuning_job_id,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
# Azure OpenAI
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
response = azure_fine_tuning_apis_instance.cancel_fine_tuning_job(
api_base=api_base,
api_key=api_key,
api_version=api_version,
fine_tuning_job_id=fine_tuning_job_id,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def alist_fine_tuning_jobs(
after: Optional[str] = None,
limit: Optional[int] = None,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FineTuningJob:
"""
Async: List your organization's fine-tuning jobs
"""
try:
loop = asyncio.get_event_loop()
kwargs["alist_fine_tuning_jobs"] = True
# Use a partial function to pass your keyword arguments
func = partial(
list_fine_tuning_jobs,
after,
limit,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def list_fine_tuning_jobs(
after: Optional[str] = None,
limit: Optional[int] = None,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
):
"""
List your organization's fine-tuning jobs
Params:
- after: Optional[str] = None, Identifier for the last job from the previous pagination request.
- limit: Optional[int] = None, Number of fine-tuning jobs to retrieve. Defaults to 20
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
### TIMEOUT LOGIC ###
timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("alist_fine_tuning_jobs", False) is True
# OpenAI
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
response = openai_fine_tuning_apis_instance.list_fine_tuning_jobs(
api_base=api_base,
api_key=api_key,
organization=organization,
after=after,
limit=limit,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
# Azure OpenAI
elif custom_llm_provider == "azure":
api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
api_version = (
optional_params.api_version
or litellm.api_version
or get_secret("AZURE_API_VERSION")
) # type: ignore
api_key = (
optional_params.api_key
or litellm.api_key
or litellm.azure_key
or get_secret("AZURE_OPENAI_API_KEY")
or get_secret("AZURE_API_KEY")
) # type: ignore
extra_body = optional_params.get("extra_body", {})
azure_ad_token: Optional[str] = None
if extra_body is not None:
azure_ad_token = extra_body.pop("azure_ad_token", None)
else:
azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
response = azure_fine_tuning_apis_instance.list_fine_tuning_jobs(
api_base=api_base,
api_key=api_key,
api_version=api_version,
after=after,
limit=limit,
timeout=timeout,
max_retries=optional_params.max_retries,
_is_async=_is_async,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
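The async variants route through the wrappers above; a short sketch (the job id is a placeholder):

import asyncio
from litellm.fine_tuning.main import acancel_fine_tuning_job, alist_fine_tuning_jobs

async def main():
    jobs = await alist_fine_tuning_jobs(limit=5, custom_llm_provider="openai")
    print(jobs)
    await acancel_fine_tuning_job(
        fine_tuning_job_id="ftjob-abc123",  # placeholder
        custom_llm_provider="openai",
    )

asyncio.run(main())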

View file

@ -11,6 +11,7 @@ from typing import Literal, Optional
import dotenv
import httpx
from pydantic import BaseModel
import litellm
from litellm import verbose_logger
@ -280,22 +281,20 @@ class BraintrustLogger(CustomLogger):
) # if litellm_params['metadata'] == None
metadata = self.add_metadata_from_header(litellm_params, metadata)
clean_metadata = {}
try:
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except:
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, dict)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
metadata = new_metadata
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, dict)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = value
elif isinstance(value, BaseModel):
new_metadata[key] = value.model_dump_json()
metadata = new_metadata
tags = []
if isinstance(metadata, dict):

View file

@ -10,7 +10,7 @@ from pydantic import BaseModel
from litellm.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.llms.openai import ChatCompletionRequest
from litellm.types.utils import ModelResponse
from litellm.types.utils import AdapterCompletionStreamWrapper, ModelResponse
class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callback#callback-class
@ -76,7 +76,9 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
"""
pass
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
def translate_completion_output_params_streaming(
self, completion_stream: Any
) -> Optional[AdapterCompletionStreamWrapper]:
"""
Translates the streaming chunk, from the OpenAI format to the custom format.
"""

View file

@ -1,5 +1,5 @@
#### What this does ####
# On success + failure, log events to Supabase
# On success + failure, log events to Datadog
import dotenv, os
import requests # type: ignore
@ -9,6 +9,21 @@ import litellm, uuid
from litellm._logging import print_verbose, verbose_logger
def make_json_serializable(payload):
for key, value in payload.items():
try:
if isinstance(value, dict):
# recursively sanitize dicts
payload[key] = make_json_serializable(value.copy())
elif not isinstance(value, (str, int, float, bool, type(None))):
# everything else becomes a string
payload[key] = str(value)
except:
# non blocking if it can't cast to a str
pass
return payload
class DataDogLogger:
# Class variables or attributes
def __init__(
@ -61,7 +76,7 @@ class DataDogLogger:
id = response_obj.get("id", str(uuid.uuid4()))
usage = dict(usage)
try:
response_time = (end_time - start_time).total_seconds()
response_time = (end_time - start_time).total_seconds() * 1000
except:
response_time = None
@ -91,12 +106,12 @@ class DataDogLogger:
"id": id,
"call_type": call_type,
"cache_hit": cache_hit,
"startTime": start_time,
"endTime": end_time,
"responseTime (seconds)": response_time,
"start_time": start_time,
"end_time": end_time,
"response_time": response_time,
"model": kwargs.get("model", ""),
"user": kwargs.get("user", ""),
"modelParameters": optional_params,
"model_parameters": optional_params,
"spend": kwargs.get("response_cost", 0),
"messages": messages,
"response": response_obj,
@ -104,13 +119,7 @@ class DataDogLogger:
"metadata": clean_metadata,
}
# Ensure everything in the payload is converted to str
for key, value in payload.items():
try:
payload[key] = str(value)
except:
# non blocking if it can't cast to a str
pass
make_json_serializable(payload)
import json
payload = json.dumps(payload)
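make_json_serializable is a pure helper, so its behavior is easy to check in isolation (the import path litellm.integrations.datadog is assumed):

from datetime import datetime

from litellm.integrations.datadog import make_json_serializable

payload = {
    "model": "gpt-3.5-turbo",
    "start_time": datetime(2024, 8, 6, 12, 0, 0),       # non-primitive -> stringified
    "metadata": {"user_api_key": None, "tags": ["a"]},   # nested dict sanitized recursively
}
make_json_serializable(payload)
print(payload["start_time"])        # '2024-08-06 12:00:00'
print(payload["metadata"]["tags"])  # "['a']"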

View file

@ -0,0 +1,203 @@
import json
import os
from datetime import datetime
from typing import Any, Dict, List, Optional, TypedDict, Union
import httpx
from pydantic import BaseModel, Field
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.logging_utils import (
convert_litellm_response_object_to_dict,
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.proxy._types import CommonProxyErrors, SpendLogsPayload
class RequestKwargs(TypedDict):
model: Optional[str]
messages: Optional[List]
optional_params: Optional[Dict[str, Any]]
class GCSBucketPayload(TypedDict):
request_kwargs: Optional[RequestKwargs]
response_obj: Optional[Dict]
start_time: str
end_time: str
class GCSBucketLogger(CustomLogger):
def __init__(self) -> None:
from litellm.proxy.proxy_server import premium_user
if premium_user is not True:
raise ValueError(
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
)
self.async_httpx_client = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
self.path_service_account_json = os.getenv("GCS_PATH_SERVICE_ACCOUNT", None)
self.BUCKET_NAME = os.getenv("GCS_BUCKET_NAME", None)
if self.BUCKET_NAME is None:
raise ValueError(
"GCS_BUCKET_NAME is not set in the environment, but GCS Bucket is being used as a logging callback. Please set 'GCS_BUCKET_NAME' in the environment."
)
if self.path_service_account_json is None:
raise ValueError(
"GCS_PATH_SERVICE_ACCOUNT is not set in the environment, but GCS Bucket is being used as a logging callback. Please set 'GCS_PATH_SERVICE_ACCOUNT' in the environment."
)
pass
#### ASYNC ####
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
from litellm.proxy.proxy_server import premium_user
if premium_user is not True:
raise ValueError(
f"GCS Bucket logging is a premium feature. Please upgrade to use it. {CommonProxyErrors.not_premium_user.value}"
)
try:
verbose_logger.debug(
"GCS Logger: async_log_success_event logging kwargs: %s, response_obj: %s",
kwargs,
response_obj,
)
start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S")
end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S")
headers = await self.construct_request_headers()
logging_payload: GCSBucketPayload = await self.get_gcs_payload(
kwargs, response_obj, start_time_str, end_time_str
)
object_name = response_obj["id"]
response = await self.async_httpx_client.post(
headers=headers,
url=f"https://storage.googleapis.com/upload/storage/v1/b/{self.BUCKET_NAME}/o?uploadType=media&name={object_name}",
json=logging_payload,
)
if response.status_code != 200:
verbose_logger.error("GCS Bucket logging error: %s", str(response.text))
verbose_logger.debug("GCS Bucket response %s", response)
verbose_logger.debug("GCS Bucket status code %s", response.status_code)
verbose_logger.debug("GCS Bucket response.text %s", response.text)
except Exception as e:
verbose_logger.error("GCS Bucket logging error: %s", str(e))
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
pass
async def construct_request_headers(self) -> Dict[str, str]:
from litellm import vertex_chat_completion
auth_header, _ = vertex_chat_completion._get_token_and_url(
model="gcs-bucket",
vertex_credentials=self.path_service_account_json,
vertex_project=None,
vertex_location=None,
gemini_api_key=None,
stream=None,
custom_llm_provider="vertex_ai",
api_base=None,
)
verbose_logger.debug("constructed auth_header %s", auth_header)
headers = {
"Authorization": f"Bearer {auth_header}", # auth_header
"Content-Type": "application/json",
}
return headers
async def get_gcs_payload(
self, kwargs, response_obj, start_time, end_time
) -> GCSBucketPayload:
request_kwargs = RequestKwargs(
model=kwargs.get("model", None),
messages=kwargs.get("messages", None),
optional_params=kwargs.get("optional_params", None),
)
response_dict = {}
response_dict = convert_litellm_response_object_to_dict(
response_obj=response_obj
)
gcs_payload: GCSBucketPayload = GCSBucketPayload(
request_kwargs=request_kwargs,
response_obj=response_dict,
start_time=start_time,
end_time=end_time,
)
return gcs_payload
async def download_gcs_object(self, object_name):
"""
Download an object from GCS.
https://cloud.google.com/storage/docs/downloading-objects#download-object-json
"""
try:
headers = await self.construct_request_headers()
url = f"https://storage.googleapis.com/storage/v1/b/{self.BUCKET_NAME}/o/{object_name}?alt=media"
# Send the GET request to download the object
response = await self.async_httpx_client.get(url=url, headers=headers)
if response.status_code != 200:
verbose_logger.error(
"GCS object download error: %s", str(response.text)
)
return None
verbose_logger.debug(
"GCS object download response status code: %s", response.status_code
)
# Return the content of the downloaded object
return response.content
except Exception as e:
verbose_logger.error("GCS object download error: %s", str(e))
return None
async def delete_gcs_object(self, object_name):
"""
Delete an object from GCS.
"""
try:
headers = await self.construct_request_headers()
url = f"https://storage.googleapis.com/storage/v1/b/{self.BUCKET_NAME}/o/{object_name}"
# Send the DELETE request to delete the object
response = await self.async_httpx_client.delete(url=url, headers=headers)
if response.status_code not in (200, 204):
verbose_logger.error(
"GCS object delete error: %s, status code: %s",
str(response.text),
response.status_code,
)
return None
verbose_logger.debug(
"GCS object delete response status code: %s, response: %s",
response.status_code,
response.text,
)
# Return the response text from the delete call
return response.text
except Exception as e:
verbose_logger.error("GCS object download error: %s", str(e))
return None
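A hedged configuration sketch for the new logger (the bucket name, service-account path, and the "gcs_bucket" callback alias are illustrative; note that __init__ above also requires a premium/enterprise key):

import os
import litellm

# Required by GCSBucketLogger.__init__ above (placeholder values)
os.environ["GCS_BUCKET_NAME"] = "my-litellm-logs"
os.environ["GCS_PATH_SERVICE_ACCOUNT"] = "/path/to/service_account.json"

# Assumption: "gcs_bucket" is the string alias litellm resolves to this logger;
# alternatively instantiate GCSBucketLogger() and append it to litellm.callbacks
litellm.callbacks = ["gcs_bucket"]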

View file

@ -31,13 +31,36 @@ class HeliconeLogger:
prompt += f"{AI_PROMPT}"
claude_provider_request = {"model": model, "prompt": prompt}
choice = response_obj["choices"][0]
message = choice["message"]
content = []
if "tool_calls" in message and message["tool_calls"]:
for tool_call in message["tool_calls"]:
content.append({
"type": "tool_use",
"id": tool_call["id"],
"name": tool_call["function"]["name"],
"input": tool_call["function"]["arguments"]
})
elif "content" in message and message["content"]:
content = [{"type": "text", "text": message["content"]}]
claude_response_obj = {
"completion": response_obj["choices"][0]["message"]["content"],
"id": response_obj["id"],
"type": "message",
"role": "assistant",
"model": model,
"stop_reason": "stop_sequence",
"content": content,
"stop_reason": choice["finish_reason"],
"stop_sequence": None,
"usage": {
"input_tokens": response_obj["usage"]["prompt_tokens"],
"output_tokens": response_obj["usage"]["completion_tokens"]
}
}
return claude_provider_request, claude_response_obj
return claude_response_obj
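For context, wiring Helicone up stays the same on the caller side; only the Anthropic-style mapping above changes. A minimal sketch (the key is a placeholder):

import os
import litellm

os.environ["HELICONE_API_KEY"] = "sk-helicone-..."  # placeholder

# "helicone" is the success_callback alias for this logger
litellm.success_callback = ["helicone"]

response = litellm.completion(
    model="claude-3-haiku-20240307",
    messages=[{"role": "user", "content": "hi"}],
    mock_response="hello",  # mocked so the sketch runs without an Anthropic key
)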
@staticmethod
def add_metadata_from_header(litellm_params: dict, metadata: dict) -> dict:
@ -96,7 +119,7 @@ class HeliconeLogger:
response_obj = response_obj.json()
if "claude" in model:
provider_request, response_obj = self.claude_mapping(
response_obj = self.claude_mapping(
model=model, messages=messages, response_obj=response_obj
)
@ -107,7 +130,11 @@ class HeliconeLogger:
}
# Code to be executed
provider_url = self.provider_url
url = "https://api.hconeai.com/oai/v1/log"
if "claude" in model:
url = "https://api.hconeai.com/anthropic/v1/log"
provider_url = "https://api.anthropic.com/v1/messages"
headers = {
"Authorization": f"Bearer {self.key}",
"Content-Type": "application/json",
@ -124,7 +151,7 @@ class HeliconeLogger:
meta.update(metadata)
data = {
"providerRequest": {
"url": self.provider_url,
"url": provider_url,
"json": provider_request,
"meta": meta,
},

View file

@ -5,6 +5,7 @@ import os
import traceback
from packaging.version import Version
from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
@ -144,6 +145,10 @@ class LangFuseLogger:
f"Langfuse Logging - Enters logging function for model {kwargs}"
)
# set default values for input/output for langfuse logging
input = None
output = None
litellm_params = kwargs.get("litellm_params", {})
litellm_call_id = kwargs.get("litellm_call_id", None)
metadata = (
@ -198,6 +203,11 @@ class LangFuseLogger:
):
input = prompt
output = response_obj["data"]
elif response_obj is not None and isinstance(
response_obj, litellm.TranscriptionResponse
):
input = prompt
output = response_obj["text"]
print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
trace_id = None
generation_id = None
@ -322,7 +332,7 @@ class LangFuseLogger:
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except:
except Exception:
new_metadata = {}
for key, value in metadata.items():
if (
@ -333,6 +343,8 @@ class LangFuseLogger:
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
elif isinstance(value, BaseModel):
new_metadata[key] = value.model_dump()
metadata = new_metadata
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")

View file

@ -2,10 +2,6 @@
# On success + failure, log events to Logfire
import os
import dotenv
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import uuid
from enum import Enum

View file

@ -119,6 +119,7 @@ class OpenTelemetry(CustomLogger):
parent_otel_span: Optional[Span] = None,
start_time: Optional[Union[datetime, float]] = None,
end_time: Optional[Union[datetime, float]] = None,
event_metadata: Optional[dict] = None,
):
from datetime import datetime
@ -149,15 +150,26 @@ class OpenTelemetry(CustomLogger):
service_logging_span.set_attribute(
key="service", value=payload.service.value
)
if event_metadata:
for key, value in event_metadata.items():
if isinstance(value, dict):
try:
value = str(value)
except Exception:
value = "litllm logging error - could_not_json_serialize"
service_logging_span.set_attribute(key, value)
service_logging_span.set_status(Status(StatusCode.OK))
service_logging_span.end(end_time=_end_time_ns)
async def async_service_failure_hook(
self,
payload: ServiceLoggerPayload,
error: Optional[str] = "",
parent_otel_span: Optional[Span] = None,
start_time: Optional[Union[datetime, float]] = None,
end_time: Optional[Union[float, datetime]] = None,
event_metadata: Optional[dict] = None,
):
from datetime import datetime
@ -188,6 +200,17 @@ class OpenTelemetry(CustomLogger):
service_logging_span.set_attribute(
key="service", value=payload.service.value
)
if error:
service_logging_span.set_attribute(key="error", value=error)
if event_metadata:
for key, value in event_metadata.items():
if isinstance(value, dict):
try:
value = str(value)
except Exception:
value = "litllm logging error - could_not_json_serialize"
service_logging_span.set_attribute(key, value)
service_logging_span.set_status(Status(StatusCode.ERROR))
service_logging_span.end(end_time=_end_time_ns)
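For context, enabling the integration is unchanged; event_metadata only adds extra attributes to the service spans above. A minimal sketch (the "otel" callback alias and the metadata key are assumptions, and the completion is mocked so no provider key is needed):

import litellm

# Assumption: "otel" is the callback alias that activates this integration;
# with no OTEL_* env vars set, the config falls back to its default exporter
litellm.callbacks = ["otel"]

litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    mock_response="hello",
    metadata={"run_name": "otel-demo"},  # surfaced as metadata.* span attributes
)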
@ -258,15 +281,26 @@ class OpenTelemetry(CustomLogger):
def _handle_failure(self, kwargs, response_obj, start_time, end_time):
from opentelemetry.trace import Status, StatusCode
verbose_logger.debug(
"OpenTelemetry Logger: Failure HandlerLogging kwargs: %s, OTEL config settings=%s",
kwargs,
self.config,
)
_parent_context, parent_otel_span = self._get_span_context(kwargs)
# Span 1: Request sent to litellm SDK
span = self.tracer.start_span(
name=self._get_span_name(kwargs),
start_time=self._to_ns(start_time),
context=self._get_span_context(kwargs),
context=_parent_context,
)
span.set_status(Status(StatusCode.ERROR))
self.set_attributes(span, kwargs, response_obj)
span.end(end_time=self._to_ns(end_time))
if parent_otel_span is not None:
parent_otel_span.end(end_time=self._to_ns(datetime.now()))
def set_tools_attributes(self, span: Span, tools):
import json
@ -299,153 +333,165 @@ class OpenTelemetry(CustomLogger):
return isinstance(value, (str, bool, int, float))
def set_attributes(self, span: Span, kwargs, response_obj):
if self.callback_name == "arize":
from litellm.integrations.arize_ai import set_arize_ai_attributes
try:
if self.callback_name == "arize":
from litellm.integrations.arize_ai import set_arize_ai_attributes
set_arize_ai_attributes(span, kwargs, response_obj)
return
from litellm.proxy._types import SpanAttributes
set_arize_ai_attributes(span, kwargs, response_obj)
return
from litellm.proxy._types import SpanAttributes
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
# https://github.com/open-telemetry/semantic-conventions/blob/main/model/registry/gen-ai.yaml
# Following Conventions here: https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
#############################################
############ LLM CALL METADATA ##############
#############################################
metadata = litellm_params.get("metadata", {}) or {}

clean_metadata = redact_user_api_key_info(metadata=metadata)

for key, value in clean_metadata.items():
    if self.is_primitive(value):
        span.set_attribute("metadata.{}".format(key), value)

#############################################
########## LLM Request Attributes ###########
#############################################

# The name of the LLM a request is being made to
if kwargs.get("model"):
    span.set_attribute(
        SpanAttributes.LLM_REQUEST_MODEL, kwargs.get("model")
    )

# The Generative AI Provider: Azure, OpenAI, etc.
span.set_attribute(
    SpanAttributes.LLM_SYSTEM,
    litellm_params.get("custom_llm_provider", "Unknown"),
)

# The maximum number of tokens the LLM generates for a request.
if optional_params.get("max_tokens"):
    span.set_attribute(
        SpanAttributes.LLM_REQUEST_MAX_TOKENS,
        optional_params.get("max_tokens"),
    )

# The temperature setting for the LLM request.
if optional_params.get("temperature"):
    span.set_attribute(
        SpanAttributes.LLM_REQUEST_TEMPERATURE,
        optional_params.get("temperature"),
    )

# The top_p sampling setting for the LLM request.
if optional_params.get("top_p"):
    span.set_attribute(
        SpanAttributes.LLM_REQUEST_TOP_P, optional_params.get("top_p")
    )

span.set_attribute(
    SpanAttributes.LLM_IS_STREAMING,
    str(optional_params.get("stream", False)),
)

if optional_params.get("tools"):
    tools = optional_params["tools"]
    self.set_tools_attributes(span, tools)

if optional_params.get("user"):
    span.set_attribute(SpanAttributes.LLM_USER, optional_params.get("user"))

if kwargs.get("messages"):
    for idx, prompt in enumerate(kwargs.get("messages")):
        if prompt.get("role"):
            span.set_attribute(
                f"{SpanAttributes.LLM_PROMPTS}.{idx}.role",
                prompt.get("role"),
            )

        if prompt.get("content"):
            if not isinstance(prompt.get("content"), str):
                prompt["content"] = str(prompt.get("content"))
            span.set_attribute(
                f"{SpanAttributes.LLM_PROMPTS}.{idx}.content",
                prompt.get("content"),
            )
#############################################
########## LLM Response Attributes ##########
#############################################
if response_obj is not None:
    if response_obj.get("choices"):
        for idx, choice in enumerate(response_obj.get("choices")):
            if choice.get("finish_reason"):
                span.set_attribute(
                    f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.finish_reason",
                    choice.get("finish_reason"),
                )
            if choice.get("message"):
                if choice.get("message").get("role"):
                    span.set_attribute(
                        f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.role",
                        choice.get("message").get("role"),
                    )
                if choice.get("message").get("content"):
                    if not isinstance(
                        choice.get("message").get("content"), str
                    ):
                        choice["message"]["content"] = str(
                            choice.get("message").get("content")
                        )
                    span.set_attribute(
                        f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.content",
                        choice.get("message").get("content"),
                    )

            message = choice.get("message")
            tool_calls = message.get("tool_calls")
            if tool_calls:
                span.set_attribute(
                    f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.name",
                    tool_calls[0].get("function").get("name"),
                )
                span.set_attribute(
                    f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.arguments",
                    tool_calls[0].get("function").get("arguments"),
                )

    # The unique identifier for the completion.
    if response_obj.get("id"):
        span.set_attribute("gen_ai.response.id", response_obj.get("id"))

    # The model used to generate the response.
    if response_obj.get("model"):
        span.set_attribute(
            SpanAttributes.LLM_RESPONSE_MODEL, response_obj.get("model")
        )

    usage = response_obj.get("usage")
    if usage:
        span.set_attribute(
            SpanAttributes.LLM_USAGE_TOTAL_TOKENS,
            usage.get("total_tokens"),
        )

        # The number of tokens used in the LLM response (completion).
        span.set_attribute(
            SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
            usage.get("completion_tokens"),
        )

        # The number of tokens used in the LLM prompt.
        span.set_attribute(
            SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
            usage.get("prompt_tokens"),
        )
except Exception as e:
verbose_logger.error(
"OpenTelemetry logging error in set_attributes %s", str(e)
)
def set_raw_request_attributes(self, span: Span, kwargs, response_obj):
@ -463,7 +509,7 @@ class OpenTelemetry(CustomLogger):
#############################################
# OTEL Attributes for the RAW Request to https://docs.anthropic.com/en/api/messages
if complete_input_dict:
if complete_input_dict and isinstance(complete_input_dict, dict):
for param, val in complete_input_dict.items():
if not isinstance(val, str):
val = str(val)
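
The attribute-setting logic above only records metadata values that pass is_primitive, and the raw-request hunk stringifies anything that is not already a str before calling span.set_attribute. A minimal sketch of that filtering, using a stand-in span object rather than the real OpenTelemetry SDK (the RecordedSpan class, the sample metadata, and the "llm." attribute prefix are illustrative assumptions, not part of this commit):

# Stand-in for an OTEL span: it just records attribute calls.
class RecordedSpan:
    def __init__(self):
        self.attributes = {}

    def set_attribute(self, key, value):
        self.attributes[key] = value


def is_primitive(value):
    # Mirrors the gating above: only simple scalar types become span attributes.
    return isinstance(value, (str, int, float, bool))


span = RecordedSpan()
metadata = {
    "user_api_key_alias": "team-a-key",  # primitive -> recorded
    "headers": {"x-request-id": "abc"},  # dict -> skipped by is_primitive
}
for key, value in metadata.items():
    if is_primitive(value):
        span.set_attribute("metadata.{}".format(key), value)

# Raw request params are stringified so nested objects still fit an attribute.
complete_input_dict = {"max_tokens": 256, "messages": [{"role": "user", "content": "hi"}]}
if complete_input_dict and isinstance(complete_input_dict, dict):
    for param, val in complete_input_dict.items():
        if not isinstance(val, str):
            val = str(val)
        span.set_attribute("llm.{}".format(param), val)

print(span.attributes)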

View file

@ -1263,6 +1263,10 @@ Model Info:
if self.alerting is None or "email" not in self.alerting:
# do nothing if user does not want email alerts
verbose_proxy_logger.error(
"Error sending email alert - 'email' not in self.alerting %s",
self.alerting,
)
return False
from litellm.proxy.proxy_server import premium_user, prisma_client

View file

@ -1,5 +1,6 @@
# What is this?
## Helper utilities
from typing import List, Literal, Optional, Tuple
def map_finish_reason(
@ -54,3 +55,31 @@ def remove_index_from_tool_calls(messages, tool_calls):
tool_call.pop("index")
return
def get_litellm_metadata_from_kwargs(kwargs: dict):
"""
Helper to get litellm metadata from all litellm request kwargs
"""
return kwargs.get("litellm_params", {}).get("metadata", {})
# Helper functions used for OTEL logging
def _get_parent_otel_span_from_kwargs(kwargs: Optional[dict] = None):
try:
if kwargs is None:
return None
litellm_params = kwargs.get("litellm_params")
_metadata = kwargs.get("metadata") or {}
if "litellm_parent_otel_span" in _metadata:
return _metadata["litellm_parent_otel_span"]
elif (
litellm_params is not None
and litellm_params.get("metadata") is not None
and "litellm_parent_otel_span" in litellm_params.get("metadata", {})
):
return litellm_params["metadata"]["litellm_parent_otel_span"]
elif "litellm_parent_otel_span" in kwargs:
return kwargs["litellm_parent_otel_span"]
except:
return None
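
_get_parent_otel_span_from_kwargs checks three locations in a fixed order: kwargs["metadata"], then litellm_params["metadata"], then the top-level kwargs, and returns None on any error. A small standalone mirror of that lookup order (the sample kwargs and the fake span value below are assumptions for illustration):

from typing import Optional


def find_parent_otel_span(kwargs: Optional[dict] = None):
    # Same precedence as the helper in this diff: request metadata first,
    # then litellm_params metadata, then the top-level kwargs.
    try:
        if kwargs is None:
            return None
        litellm_params = kwargs.get("litellm_params")
        _metadata = kwargs.get("metadata") or {}
        if "litellm_parent_otel_span" in _metadata:
            return _metadata["litellm_parent_otel_span"]
        if (
            litellm_params is not None
            and litellm_params.get("metadata") is not None
            and "litellm_parent_otel_span" in litellm_params.get("metadata", {})
        ):
            return litellm_params["metadata"]["litellm_parent_otel_span"]
        if "litellm_parent_otel_span" in kwargs:
            return kwargs["litellm_parent_otel_span"]
    except Exception:
        return None


# The span is found even when it is only nested under litellm_params.
sample_kwargs = {"litellm_params": {"metadata": {"litellm_parent_otel_span": "span-123"}}}
print(find_parent_otel_span(sample_kwargs))  # -> "span-123"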

View file

@ -10,7 +10,9 @@ import sys
import time
import traceback
import uuid
from typing import Any, Callable, Dict, List, Literal, Optional
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from pydantic import BaseModel
import litellm
from litellm import (
@ -59,6 +61,7 @@ from ..integrations.custom_logger import CustomLogger
from ..integrations.datadog import DataDogLogger
from ..integrations.dynamodb import DyanmoDBLogger
from ..integrations.galileo import GalileoObserve
from ..integrations.gcs_bucket import GCSBucketLogger
from ..integrations.greenscale import GreenscaleLogger
from ..integrations.helicone import HeliconeLogger
from ..integrations.lago import LagoLogger
@ -231,6 +234,9 @@ class Logging:
):
self.custom_pricing = True
if "custom_llm_provider" in self.model_call_details:
self.custom_llm_provider = self.model_call_details["custom_llm_provider"]
def _pre_call(self, input, api_key, model=None, additional_args={}):
"""
Common helper function across the sync + async pre-call function
@ -500,6 +506,44 @@ class Logging:
)
)
def _response_cost_calculator(
self,
result: Union[
ModelResponse,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
TextCompletionResponse,
HttpxBinaryResponseContent,
],
):
"""
Calculate response cost using result + logging object variables.
used for consistent cost calculation across response headers + logging integrations.
"""
## RESPONSE COST ##
custom_pricing = use_custom_pricing_for_model(
litellm_params=self.litellm_params
)
response_cost = litellm.response_cost_calculator(
response_object=result,
model=self.model,
cache_hit=self.model_call_details.get("cache_hit", False),
custom_llm_provider=self.model_call_details.get(
"custom_llm_provider", None
),
base_model=_get_base_model_from_metadata(
model_call_details=self.model_call_details
),
call_type=self.call_type,
optional_params=self.optional_params,
custom_pricing=custom_pricing,
)
return response_cost
def _success_handler_helper_fn(
self, result=None, start_time=None, end_time=None, cache_hit=None
):
@ -529,25 +573,32 @@ class Logging:
or isinstance(result, TextCompletionResponse)
or isinstance(result, HttpxBinaryResponseContent) # tts
):
## RESPONSE COST ##
custom_pricing = use_custom_pricing_for_model(
litellm_params=self.litellm_params
)
self.model_call_details["response_cost"] = (
litellm.response_cost_calculator(
response_object=result,
model=self.model,
cache_hit=self.model_call_details.get("cache_hit", False),
custom_llm_provider=self.model_call_details.get(
"custom_llm_provider", None
),
base_model=_get_base_model_from_metadata(
model_call_details=self.model_call_details
),
call_type=self.call_type,
optional_params=self.optional_params,
custom_pricing=custom_pricing,
)
self._response_cost_calculator(result=result)
)
## HIDDEN PARAMS ##
if hasattr(result, "_hidden_params"):
# add to metadata for logging
if self.model_call_details.get("litellm_params") is not None:
self.model_call_details["litellm_params"].setdefault(
"metadata", {}
)
if (
self.model_call_details["litellm_params"]["metadata"]
is None
):
self.model_call_details["litellm_params"][
"metadata"
] = {}
self.model_call_details["litellm_params"]["metadata"][
"hidden_params"
] = result._hidden_params
else: # streaming chunks + image gen.
self.model_call_details["response_cost"] = None
@ -1220,7 +1271,9 @@ class Logging:
"""
Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions.
"""
print_verbose("Logging Details LiteLLM-Async Success Call")
print_verbose(
"Logging Details LiteLLM-Async Success Call, cache_hit={}".format(cache_hit)
)
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
)
@ -1490,6 +1543,13 @@ class Logging:
self.model_call_details["end_time"] = end_time
self.model_call_details.setdefault("original_response", None)
self.model_call_details["response_cost"] = 0
if hasattr(exception, "headers") and isinstance(exception.headers, dict):
self.model_call_details.setdefault("litellm_params", {})
metadata = (
self.model_call_details["litellm_params"].get("metadata", {}) or {}
)
metadata.update(exception.headers)
return start_time, end_time
def failure_handler(
@ -1962,6 +2022,14 @@ def _init_custom_logger_compatible_class(
_langsmith_logger = LangsmithLogger()
_in_memory_loggers.append(_langsmith_logger)
return _langsmith_logger # type: ignore
elif logging_integration == "gcs_bucket":
for callback in _in_memory_loggers:
if isinstance(callback, GCSBucketLogger):
return callback # type: ignore
_gcs_bucket_logger = GCSBucketLogger()
_in_memory_loggers.append(_gcs_bucket_logger)
return _gcs_bucket_logger # type: ignore
elif logging_integration == "arize":
if "ARIZE_SPACE_KEY" not in os.environ:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
@ -2076,6 +2144,10 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, LangsmithLogger):
return callback
elif logging_integration == "gcs_bucket":
for callback in _in_memory_loggers:
if isinstance(callback, GCSBucketLogger):
return callback
elif logging_integration == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry
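
Both _init_custom_logger_compatible_class and get_custom_logger_compatible_class handle the new "gcs_bucket" integration with the same pattern: scan _in_memory_loggers for an existing GCSBucketLogger and reuse it, otherwise construct one and append it. A generic sketch of that reuse-or-create registry (DummyLogger is an illustrative stand-in, not a litellm class):

_in_memory_loggers: list = []


class DummyLogger:
    """Stand-in for an integration class such as GCSBucketLogger."""


def get_or_create_logger(logger_cls):
    # Reuse an already-initialized instance so callbacks are not duplicated.
    for callback in _in_memory_loggers:
        if isinstance(callback, logger_cls):
            return callback
    new_logger = logger_cls()
    _in_memory_loggers.append(new_logger)
    return new_logger


first = get_or_create_logger(DummyLogger)
second = get_or_create_logger(DummyLogger)
assert first is second  # the registry hands back the same instance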

View file

@ -44,7 +44,12 @@ def cost_router(
Returns
- str, the specific google cost calc function it should route to.
"""
if custom_llm_provider == "vertex_ai" and "claude" in model:
if custom_llm_provider == "vertex_ai" and (
"claude" in model
or "llama" in model
or "mistral" in model
or "codestral" in model
):
return "cost_per_token"
elif custom_llm_provider == "gemini":
return "cost_per_token"

View file

@ -0,0 +1,22 @@
from typing import Any
import litellm
"""
Helper utils used for logging callbacks
"""
def convert_litellm_response_object_to_dict(response_obj: Any) -> dict:
"""
Convert a LiteLLM response object to a dictionary
"""
if isinstance(response_obj, dict):
return response_obj
for _type in litellm.ALL_LITELLM_RESPONSE_TYPES:
if isinstance(response_obj, _type):
return response_obj.model_dump()
# If it's not a LiteLLM type, return the object as is
return dict(response_obj)
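
The helper returns dicts unchanged, calls model_dump() on known LiteLLM response types, and falls back to dict() for anything else. A hedged usage sketch built directly on pydantic rather than litellm's own response types (FakeResponse is an assumption for illustration):

from typing import Any

from pydantic import BaseModel


class FakeResponse(BaseModel):
    id: str
    model: str


def to_dict(response_obj: Any) -> dict:
    # Same shape as convert_litellm_response_object_to_dict, with the
    # ALL_LITELLM_RESPONSE_TYPES check replaced by a plain BaseModel check.
    if isinstance(response_obj, dict):
        return response_obj
    if isinstance(response_obj, BaseModel):
        return response_obj.model_dump()
    return dict(response_obj)


print(to_dict(FakeResponse(id="resp-1", model="gpt-4o-mini")))
print(to_dict({"already": "a dict"}))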

View file

@ -5,13 +5,16 @@ import time
import types
from enum import Enum
from functools import partial
from typing import Callable, List, Optional, Union
from typing import Callable, List, Literal, Optional, Tuple, Union
import httpx # type: ignore
import requests # type: ignore
from openai.types.chat.chat_completion_chunk import Choice as OpenAIStreamingChoice
import litellm
import litellm.litellm_core_utils
import litellm.types
import litellm.types.utils
from litellm import verbose_logger
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import (
@ -33,8 +36,12 @@ from litellm.types.llms.anthropic import (
AnthropicResponseUsageBlock,
ContentBlockDelta,
ContentBlockStart,
ContentJsonBlockDelta,
ContentTextBlockDelta,
MessageBlockDelta,
MessageDelta,
MessageStartBlock,
UsageDelta,
)
from litellm.types.llms.openai import (
AllMessageValues,
@ -72,7 +79,7 @@ class AnthropicConstants(Enum):
class AnthropicError(Exception):
def __init__(self, status_code, message):
def __init__(self, status_code: int, message):
self.status_code = status_code
self.message: str = message
self.request = httpx.Request(
@ -464,7 +471,8 @@ class AnthropicConfig:
# extract usage
usage: litellm.Usage = getattr(response, "usage")
anthropic_usage = AnthropicResponseUsageBlock(
input_tokens=usage.prompt_tokens, output_tokens=usage.completion_tokens
input_tokens=usage.prompt_tokens or 0,
output_tokens=usage.completion_tokens or 0,
)
translated_obj = AnthropicResponse(
id=response.id,
@ -479,6 +487,74 @@ class AnthropicConfig:
return translated_obj
def _translate_streaming_openai_chunk_to_anthropic(
self, choices: List[OpenAIStreamingChoice]
) -> Tuple[
Literal["text_delta", "input_json_delta"],
Union[ContentTextBlockDelta, ContentJsonBlockDelta],
]:
text: str = ""
partial_json: Optional[str] = None
for choice in choices:
if choice.delta.content is not None:
text += choice.delta.content
elif choice.delta.tool_calls is not None:
partial_json = ""
for tool in choice.delta.tool_calls:
if (
tool.function is not None
and tool.function.arguments is not None
):
partial_json += tool.function.arguments
if partial_json is not None:
return "input_json_delta", ContentJsonBlockDelta(
type="input_json_delta", partial_json=partial_json
)
else:
return "text_delta", ContentTextBlockDelta(type="text_delta", text=text)
def translate_streaming_openai_response_to_anthropic(
self, response: litellm.ModelResponse
) -> Union[ContentBlockDelta, MessageBlockDelta]:
## base case - final chunk w/ finish reason
if response.choices[0].finish_reason is not None:
delta = MessageDelta(
stop_reason=self._translate_openai_finish_reason_to_anthropic(
response.choices[0].finish_reason
),
)
if getattr(response, "usage", None) is not None:
litellm_usage_chunk: Optional[litellm.Usage] = response.usage # type: ignore
elif (
hasattr(response, "_hidden_params")
and "usage" in response._hidden_params
):
litellm_usage_chunk = response._hidden_params["usage"]
else:
litellm_usage_chunk = None
if litellm_usage_chunk is not None:
usage_delta = UsageDelta(
input_tokens=litellm_usage_chunk.prompt_tokens or 0,
output_tokens=litellm_usage_chunk.completion_tokens or 0,
)
else:
usage_delta = UsageDelta(input_tokens=0, output_tokens=0)
return MessageBlockDelta(
type="message_delta", delta=delta, usage=usage_delta
)
(
type_of_content,
content_block_delta,
) = self._translate_streaming_openai_chunk_to_anthropic(
choices=response.choices # type: ignore
)
return ContentBlockDelta(
type="content_block_delta",
index=response.choices[0].index,
delta=content_block_delta,
)
# makes headers for API call
def validate_environment(api_key, user_headers, model):
@ -507,17 +583,23 @@ async def make_call(
model: str,
messages: list,
logging_obj,
timeout: Optional[Union[float, httpx.Timeout]],
):
if client is None:
client = _get_async_httpx_client() # Create a new client if none provided
try:
response = await client.post(api_base, headers=headers, data=data, stream=True)
response = await client.post(
api_base, headers=headers, data=data, stream=True, timeout=timeout
)
except httpx.HTTPStatusError as e:
raise AnthropicError(
status_code=e.response.status_code, message=await e.response.aread()
)
except Exception as e:
for exception in litellm.LITELLM_EXCEPTION_TYPES:
if isinstance(e, exception):
raise e
raise AnthropicError(status_code=500, message=str(e))
if response.status_code != 200:
@ -540,6 +622,51 @@ async def make_call(
return completion_stream
def make_sync_call(
client: Optional[HTTPHandler],
api_base: str,
headers: dict,
data: str,
model: str,
messages: list,
logging_obj,
timeout: Optional[Union[float, httpx.Timeout]],
):
if client is None:
client = HTTPHandler() # Create a new client if none provided
try:
response = client.post(
api_base, headers=headers, data=data, stream=True, timeout=timeout
)
except httpx.HTTPStatusError as e:
raise AnthropicError(
status_code=e.response.status_code, message=e.response.read()
)
except Exception as e:
for exception in litellm.LITELLM_EXCEPTION_TYPES:
if isinstance(e, exception):
raise e
raise AnthropicError(status_code=500, message=str(e))
if response.status_code != 200:
raise AnthropicError(status_code=response.status_code, message=response.read())
completion_stream = ModelResponseIterator(
streaming_response=response.iter_lines(), sync_stream=True
)
# LOGGING
logging_obj.post_call(
input=messages,
api_key="",
original_response="first stream response received",
additional_args={"complete_input_dict": data},
)
return completion_stream
class AnthropicChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
@ -647,6 +774,7 @@ class AnthropicChatCompletion(BaseLLM):
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
timeout: Union[float, httpx.Timeout],
encoding,
api_key,
logging_obj,
@ -659,20 +787,6 @@ class AnthropicChatCompletion(BaseLLM):
headers={},
):
data["stream"] = True
# async_handler = AsyncHTTPHandler(
# timeout=httpx.Timeout(timeout=600.0, connect=20.0)
# )
# response = await async_handler.post(
# api_base, headers=headers, json=data, stream=True
# )
# if response.status_code != 200:
# raise AnthropicError(
# status_code=response.status_code, message=response.text
# )
# completion_stream = response.aiter_lines()
streamwrapper = CustomStreamWrapper(
completion_stream=None,
@ -685,6 +799,7 @@ class AnthropicChatCompletion(BaseLLM):
model=model,
messages=messages,
logging_obj=logging_obj,
timeout=timeout,
),
model=model,
custom_llm_provider="anthropic",
@ -700,6 +815,7 @@ class AnthropicChatCompletion(BaseLLM):
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
timeout: Union[float, httpx.Timeout],
encoding,
api_key,
logging_obj,
@ -716,7 +832,9 @@ class AnthropicChatCompletion(BaseLLM):
async_handler = _get_async_httpx_client()
try:
response = await async_handler.post(api_base, headers=headers, json=data)
response = await async_handler.post(
api_base, headers=headers, json=data, timeout=timeout
)
except Exception as e:
## LOGGING
logging_obj.post_call(
@ -876,6 +994,7 @@ class AnthropicChatCompletion(BaseLLM):
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
)
else:
return self.acompletion_function(
@ -897,43 +1016,40 @@ class AnthropicChatCompletion(BaseLLM):
headers=headers,
client=client,
json_mode=json_mode,
timeout=timeout,
)
else:
## COMPLETION CALL
if client is None or isinstance(client, AsyncHTTPHandler):
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(timeout=timeout) # type: ignore
else:
client = client
if (
stream is True
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes anthropic streaming POST request")
data["stream"] = stream
response = requests.post(
api_base,
headers=headers,
data=json.dumps(data),
stream=stream,
)
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
)
completion_stream = ModelResponseIterator(
streaming_response=response.iter_lines(), sync_stream=True
)
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
return CustomStreamWrapper(
completion_stream=None,
make_call=partial(
make_sync_call,
client=None,
api_base=api_base,
headers=headers, # type: ignore
data=json.dumps(data),
model=model,
messages=messages,
logging_obj=logging_obj,
timeout=timeout,
),
model=model,
custom_llm_provider="anthropic",
logging_obj=logging_obj,
)
return streaming_response
else:
response = client.post(api_base, headers=headers, data=json.dumps(data))
response = client.post(
api_base, headers=headers, data=json.dumps(data), timeout=timeout
)
if response.status_code != 200:
raise AnthropicError(
status_code=response.status_code, message=response.text
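
The sync streaming path now mirrors the async one: instead of issuing the POST up front, it hands CustomStreamWrapper a make_call built with functools.partial, so the request is only sent when the stream is iterated. A minimal sketch of that deferred-call pattern with stand-in types (StreamWrapper and fake_make_call are illustrative, not litellm classes):

from functools import partial


def fake_make_call(api_base, headers, data, timeout):
    # Stand-in for make_sync_call: pretend we opened a streaming response.
    print(f"POST {api_base} (timeout={timeout})")
    return iter(["chunk-1", "chunk-2"])


class StreamWrapper:
    def __init__(self, completion_stream, make_call):
        self.completion_stream = completion_stream
        self.make_call = make_call

    def __iter__(self):
        # The HTTP call happens here, on first iteration, not at construction.
        if self.completion_stream is None:
            self.completion_stream = self.make_call()
        return iter(self.completion_stream)


wrapper = StreamWrapper(
    completion_stream=None,
    make_call=partial(
        fake_make_call,
        api_base="https://api.anthropic.com/v1/messages",
        headers={},
        data="{}",
        timeout=600.0,
    ),
)
for chunk in wrapper:  # the request is issued only now
    print(chunk)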

View file

@ -474,21 +474,13 @@ class AzureChatCompletion(BaseLLM):
- call chat.completions.create by default
"""
try:
if litellm.return_response_headers is True:
raw_response = (
await azure_client.chat.completions.with_raw_response.create(
**data, timeout=timeout
)
)
raw_response = await azure_client.chat.completions.with_raw_response.create(
**data, timeout=timeout
)
headers = dict(raw_response.headers)
response = raw_response.parse()
return headers, response
else:
response = await azure_client.chat.completions.create(
**data, timeout=timeout
)
return None, response
headers = dict(raw_response.headers)
response = raw_response.parse()
return headers, response
except Exception as e:
raise e
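
After this change the Azure helper always goes through with_raw_response, so callers consistently receive a (headers, response) tuple instead of sometimes (None, response). A hedged sketch of that pattern against the AsyncAzureOpenAI client; the endpoint, key, api_version, and deployment name below are placeholders:

from openai import AsyncAzureOpenAI


async def chat_with_headers():
    client = AsyncAzureOpenAI(
        azure_endpoint="https://example-resource.openai.azure.com",
        api_key="placeholder-key",
        api_version="2024-02-01",
    )
    # with_raw_response keeps the HTTP layer visible: headers for rate-limit
    # bookkeeping, .parse() for the usual typed completion object.
    raw_response = await client.chat.completions.with_raw_response.create(
        model="my-azure-deployment",
        messages=[{"role": "user", "content": "hi"}],
    )
    headers = dict(raw_response.headers)
    response = raw_response.parse()
    return headers, response


# Run with: asyncio.run(chat_with_headers())  (requires real Azure credentials)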

View file

@ -13,6 +13,7 @@ from enum import Enum
from typing import Any, Callable, List, Optional, Union
import httpx
from openai.types.image import Image
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
@ -1413,10 +1414,10 @@ def embedding(
def image_generation(
model: str,
prompt: str,
model_response: ImageResponse,
optional_params: dict,
timeout=None,
logging_obj=None,
model_response=None,
optional_params=None,
aimg_generation=False,
):
"""
@ -1513,9 +1514,10 @@ def image_generation(
if model_response is None:
model_response = ImageResponse()
image_list: List = []
image_list: List[Image] = []
for artifact in response_body["artifacts"]:
image_dict = {"url": artifact["base64"]}
_image = Image(b64_json=artifact["base64"])
image_list.append(_image)
model_response.data = image_dict
model_response.data = image_list
return model_response
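
The image path now returns a list of openai Image objects built from b64_json instead of overwriting model_response.data with a single url dict on every loop iteration, so all artifacts survive rather than only the last one. A small sketch of the corrected accumulation (the fake response body is an assumption):

from typing import List

from openai.types.image import Image

# Pretend this came back from the Bedrock Stability response body.
response_body = {"artifacts": [{"base64": "aGVsbG8="}, {"base64": "d29ybGQ="}]}

image_list: List[Image] = []
for artifact in response_body["artifacts"]:
    # Each artifact becomes its own Image entry instead of clobbering the last.
    image_list.append(Image(b64_json=artifact["base64"]))

print(len(image_list))  # 2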

View file

@ -42,8 +42,11 @@ from litellm.types.llms.openai import (
ChatCompletionResponseMessage,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import Choices, Message
from litellm.types.utils import Choices
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import Message
from litellm.utils import (
CustomStreamWrapper,
ModelResponse,
@ -78,6 +81,7 @@ BEDROCK_CONVERSE_MODELS = [
"ai21.jamba-instruct-v1:0",
"meta.llama3-1-8b-instruct-v1:0",
"meta.llama3-1-70b-instruct-v1:0",
"meta.llama3-1-405b-instruct-v1:0",
"mistral.mistral-large-2407-v1:0",
]
@ -244,7 +248,7 @@ async def make_call(
return completion_stream
except httpx.HTTPStatusError as err:
error_code = err.response.status_code
raise BedrockError(status_code=error_code, message=str(err))
raise BedrockError(status_code=error_code, message=err.response.text)
except httpx.TimeoutException as e:
raise BedrockError(status_code=408, message="Timeout error occurred.")
except Exception as e:
@ -382,6 +386,7 @@ class BedrockLLM(BaseLLM):
aws_profile_name: Optional[str] = None,
aws_role_name: Optional[str] = None,
aws_web_identity_token: Optional[str] = None,
aws_sts_endpoint: Optional[str] = None,
):
"""
Return a boto3.Credentials object
@ -402,6 +407,7 @@ class BedrockLLM(BaseLLM):
aws_profile_name,
aws_role_name,
aws_web_identity_token,
aws_sts_endpoint,
]
# Iterate over parameters and update if needed
@ -420,6 +426,7 @@ class BedrockLLM(BaseLLM):
aws_profile_name,
aws_role_name,
aws_web_identity_token,
aws_sts_endpoint,
) = params_to_check
### CHECK STS ###
@ -431,12 +438,19 @@ class BedrockLLM(BaseLLM):
print_verbose(
f"IN Web Identity Token: {aws_web_identity_token} | Role Name: {aws_role_name} | Session Name: {aws_session_name}"
)
if aws_sts_endpoint is None:
sts_endpoint = f"https://sts.{aws_region_name}.amazonaws.com"
else:
sts_endpoint = aws_sts_endpoint
iam_creds_cache_key = json.dumps(
{
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
"aws_sts_endpoint": sts_endpoint,
}
)
@ -453,7 +467,7 @@ class BedrockLLM(BaseLLM):
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
endpoint_url=sts_endpoint,
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
@ -848,6 +862,7 @@ class BedrockLLM(BaseLLM):
"aws_bedrock_runtime_endpoint", None
) # https://bedrock-runtime.{region_name}.amazonaws.com
aws_web_identity_token = optional_params.pop("aws_web_identity_token", None)
aws_sts_endpoint = optional_params.pop("aws_sts_endpoint", None)
### SET REGION NAME ###
if aws_region_name is None:
@ -877,6 +892,7 @@ class BedrockLLM(BaseLLM):
aws_profile_name=aws_profile_name,
aws_role_name=aws_role_name,
aws_web_identity_token=aws_web_identity_token,
aws_sts_endpoint=aws_sts_endpoint,
)
### SET RUNTIME ENDPOINT ###
@ -1535,6 +1551,7 @@ class BedrockConverseLLM(BaseLLM):
aws_profile_name: Optional[str] = None,
aws_role_name: Optional[str] = None,
aws_web_identity_token: Optional[str] = None,
aws_sts_endpoint: Optional[str] = None,
):
"""
Return a boto3.Credentials object
@ -1551,6 +1568,7 @@ class BedrockConverseLLM(BaseLLM):
aws_profile_name,
aws_role_name,
aws_web_identity_token,
aws_sts_endpoint,
]
# Iterate over parameters and update if needed
@ -1569,6 +1587,7 @@ class BedrockConverseLLM(BaseLLM):
aws_profile_name,
aws_role_name,
aws_web_identity_token,
aws_sts_endpoint,
) = params_to_check
### CHECK STS ###
@ -1577,12 +1596,22 @@ class BedrockConverseLLM(BaseLLM):
and aws_role_name is not None
and aws_session_name is not None
):
print_verbose(
f"IN Web Identity Token: {aws_web_identity_token} | Role Name: {aws_role_name} | Session Name: {aws_session_name}"
)
if aws_sts_endpoint is None:
sts_endpoint = f"https://sts.{aws_region_name}.amazonaws.com"
else:
sts_endpoint = aws_sts_endpoint
iam_creds_cache_key = json.dumps(
{
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
"aws_sts_endpoint": sts_endpoint,
}
)
@ -1599,7 +1628,7 @@ class BedrockConverseLLM(BaseLLM):
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
endpoint_url=sts_endpoint,
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
@ -1814,6 +1843,7 @@ class BedrockConverseLLM(BaseLLM):
"aws_bedrock_runtime_endpoint", None
) # https://bedrock-runtime.{region_name}.amazonaws.com
aws_web_identity_token = optional_params.pop("aws_web_identity_token", None)
aws_sts_endpoint = optional_params.pop("aws_sts_endpoint", None)
### SET REGION NAME ###
if aws_region_name is None:
@ -1843,6 +1873,7 @@ class BedrockConverseLLM(BaseLLM):
aws_profile_name=aws_profile_name,
aws_role_name=aws_role_name,
aws_web_identity_token=aws_web_identity_token,
aws_sts_endpoint=aws_sts_endpoint,
)
### SET RUNTIME ENDPOINT ###
@ -1888,12 +1919,14 @@ class BedrockConverseLLM(BaseLLM):
additional_request_params = {}
supported_converse_params = AmazonConverseConfig.__annotations__.keys()
supported_tool_call_params = ["tools", "tool_choice"]
supported_guardrail_params = ["guardrailConfig"]
## TRANSFORMATION ##
# send all model-specific params in 'additional_request_params'
for k, v in inference_params.items():
if (
k not in supported_converse_params
and k not in supported_tool_call_params
and k not in supported_guardrail_params
):
additional_request_params[k] = v
additional_request_keys.append(k)
@ -1925,6 +1958,15 @@ class BedrockConverseLLM(BaseLLM):
"system": system_content_blocks,
"inferenceConfig": InferenceConfig(**inference_params),
}
# Guardrail Config
guardrail_config: Optional[GuardrailConfigBlock] = None
request_guardrails_config = inference_params.pop("guardrailConfig", None)
if request_guardrails_config is not None:
guardrail_config = GuardrailConfigBlock(**request_guardrails_config)
_data["guardrailConfig"] = guardrail_config
# Tool Config
if bedrock_tool_config is not None:
_data["toolConfig"] = bedrock_tool_config
data = json.dumps(_data)
@ -2068,13 +2110,13 @@ class AWSEventStreamDecoder:
self.model = model
self.parser = EventStreamJSONParser()
def converse_chunk_parser(self, chunk_data: dict) -> GenericStreamingChunk:
def converse_chunk_parser(self, chunk_data: dict) -> GChunk:
try:
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
is_finished = False
finish_reason = ""
usage: Optional[ConverseTokenUsageBlock] = None
usage: Optional[ChatCompletionUsageBlock] = None
index = int(chunk_data.get("contentBlockIndex", 0))
if "start" in chunk_data:
@ -2111,9 +2153,13 @@ class AWSEventStreamDecoder:
finish_reason = map_finish_reason(chunk_data.get("stopReason", "stop"))
is_finished = True
elif "usage" in chunk_data:
usage = ConverseTokenUsageBlock(**chunk_data["usage"]) # type: ignore
usage = ChatCompletionUsageBlock(
prompt_tokens=chunk_data.get("inputTokens", 0),
completion_tokens=chunk_data.get("outputTokens", 0),
total_tokens=chunk_data.get("totalTokens", 0),
)
response = GenericStreamingChunk(
response = GChunk(
text=text,
tool_use=tool_use,
is_finished=is_finished,
@ -2125,7 +2171,7 @@ class AWSEventStreamDecoder:
except Exception as e:
raise Exception("Received streaming error - {}".format(str(e)))
def _chunk_parser(self, chunk_data: dict) -> GenericStreamingChunk:
def _chunk_parser(self, chunk_data: dict) -> GChunk:
text = ""
is_finished = False
finish_reason = ""
@ -2168,7 +2214,7 @@ class AWSEventStreamDecoder:
elif chunk_data.get("completionReason", None):
is_finished = True
finish_reason = chunk_data["completionReason"]
return GenericStreamingChunk(
return GChunk(
text=text,
is_finished=is_finished,
finish_reason=finish_reason,
@ -2177,7 +2223,7 @@ class AWSEventStreamDecoder:
tool_use=None,
)
def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GenericStreamingChunk]:
def iter_bytes(self, iterator: Iterator[bytes]) -> Iterator[GChunk]:
"""Given an iterator that yields lines, iterate over it & yield every event encountered"""
from botocore.eventstream import EventStreamBuffer
@ -2193,7 +2239,7 @@ class AWSEventStreamDecoder:
async def aiter_bytes(
self, iterator: AsyncIterator[bytes]
) -> AsyncIterator[GenericStreamingChunk]:
) -> AsyncIterator[GChunk]:
"""Given an async iterator that yields lines, iterate over it & yield every event encountered"""
from botocore.eventstream import EventStreamBuffer
@ -2233,20 +2279,16 @@ class MockResponseIterator: # for returning ai21 streaming responses
def __iter__(self):
return self
def _chunk_parser(self, chunk_data: ModelResponse) -> GenericStreamingChunk:
def _chunk_parser(self, chunk_data: ModelResponse) -> GChunk:
try:
chunk_usage: litellm.Usage = getattr(chunk_data, "usage")
processed_chunk = GenericStreamingChunk(
processed_chunk = GChunk(
text=chunk_data.choices[0].message.content or "", # type: ignore
tool_use=None,
is_finished=True,
finish_reason=chunk_data.choices[0].finish_reason, # type: ignore
usage=ConverseTokenUsageBlock(
inputTokens=chunk_usage.prompt_tokens,
outputTokens=chunk_usage.completion_tokens,
totalTokens=chunk_usage.total_tokens,
),
usage=chunk_usage, # type: ignore
index=0,
)
return processed_chunk
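
The new aws_sts_endpoint parameter lets the web-identity flow target a private or regional STS endpoint instead of the hard-coded https://sts.{region}.amazonaws.com. A hedged sketch of how that parameter feeds boto3; the role ARN, token path, and endpoint values are placeholders:

from typing import Optional

import boto3


def get_bedrock_credentials(
    aws_region_name: str,
    aws_role_name: str,
    aws_session_name: str,
    aws_web_identity_token: str,
    aws_sts_endpoint: Optional[str] = None,
):
    # Default matches the old behaviour; the new parameter simply overrides it.
    if aws_sts_endpoint is None:
        sts_endpoint = f"https://sts.{aws_region_name}.amazonaws.com"
    else:
        sts_endpoint = aws_sts_endpoint
    sts_client = boto3.client(
        "sts", region_name=aws_region_name, endpoint_url=sts_endpoint
    )
    return sts_client.assume_role_with_web_identity(
        RoleArn=aws_role_name,
        RoleSessionName=aws_session_name,
        WebIdentityToken=aws_web_identity_token,
    )


# creds = get_bedrock_credentials(
#     "us-east-1", "arn:aws:iam::123456789012:role/bedrock-role", "litellm-session",
#     open("/var/run/secrets/token").read(), aws_sts_endpoint="https://sts.us-east-1.amazonaws.com",
# )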

View file

@ -1,15 +1,20 @@
#################### OLD ########################
##### See `cohere_chat.py` for `/chat` calls ####
#################################################
import json
import os
import time
import traceback
import types
from enum import Enum
from typing import Callable, Optional
from typing import Any, Callable, Optional, Union
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import Choices, Message, ModelResponse, Usage
@ -246,14 +251,98 @@ def completion(
return model_response
def _process_embedding_response(
embeddings: list,
model_response: litellm.EmbeddingResponse,
model: str,
encoding: Any,
input: list,
) -> litellm.EmbeddingResponse:
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
)
return model_response
async def async_embedding(
model: str,
data: dict,
input: list,
model_response: litellm.utils.EmbeddingResponse,
timeout: Union[float, httpx.Timeout],
logging_obj: LiteLLMLoggingObj,
optional_params: dict,
api_base: str,
api_key: Optional[str],
headers: dict,
encoding: Callable,
client: Optional[AsyncHTTPHandler] = None,
):
## LOGGING
logging_obj.pre_call(
input=input,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": api_base,
},
)
## COMPLETION CALL
if client is None:
client = AsyncHTTPHandler(concurrent_limit=1)
response = await client.post(api_base, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
embeddings = response.json()["embeddings"]
## PROCESS RESPONSE ##
return _process_embedding_response(
embeddings=embeddings,
model_response=model_response,
model=model,
encoding=encoding,
input=input,
)
def embedding(
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
logging_obj: LiteLLMLoggingObj,
optional_params: dict,
encoding: Any,
api_key: Optional[str] = None,
logging_obj=None,
encoding=None,
optional_params=None,
aembedding: Optional[bool] = None,
timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
headers = validate_environment(api_key)
embed_url = "https://api.cohere.ai/v1/embed"
@ -270,8 +359,26 @@ def embedding(
api_key=api_key,
additional_args={"complete_input_dict": data},
)
## ROUTING
if aembedding is True:
return async_embedding(
model=model,
data=data,
input=input,
model_response=model_response,
timeout=timeout,
logging_obj=logging_obj,
optional_params=optional_params,
api_base=embed_url,
api_key=api_key,
headers=headers,
encoding=encoding,
)
## COMPLETION CALL
response = requests.post(embed_url, headers=headers, data=json.dumps(data))
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(concurrent_limit=1)
response = client.post(embed_url, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
@ -293,23 +400,11 @@ def embedding(
if response.status_code != 200:
raise CohereError(message=response.text, status_code=response.status_code)
embeddings = response.json()["embeddings"]
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
return _process_embedding_response(
embeddings=embeddings,
model_response=model_response,
model=model,
encoding=encoding,
input=input,
)
return model_response
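
Sync and async Cohere embedding now share _process_embedding_response, which builds the OpenAI-style data list and a prompt-token-only Usage block. A standalone sketch of that shaping; the fake embeddings and the whitespace "tokenizer" stand in for the real encoding object:

def process_embedding_response(embeddings, model, texts):
    # OpenAI-compatible layout: one {"object": "embedding", ...} entry per input.
    output_data = [
        {"object": "embedding", "index": idx, "embedding": embedding}
        for idx, embedding in enumerate(embeddings)
    ]
    # Cohere does not return usage here, so prompt tokens are estimated locally
    # and completion tokens stay at zero.
    input_tokens = sum(len(text.split()) for text in texts)  # crude stand-in tokenizer
    return {
        "object": "list",
        "data": output_data,
        "model": model,
        "usage": {
            "prompt_tokens": input_tokens,
            "completion_tokens": 0,
            "total_tokens": input_tokens,
        },
    }


print(process_embedding_response([[0.1, 0.2], [0.3, 0.4]], "embed-english-v3.0", ["hello world", "hi"]))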

View file

@ -233,8 +233,14 @@ def completion(
optional_params["tool_results"] = [most_recent_message]
elif isinstance(most_recent_message, str):
optional_params["message"] = most_recent_message
## check if chat history message is 'user' and 'tool_results' is given -> force_single_step=True, else cohere api fails
if len(chat_history) > 0 and chat_history[-1]["role"] == "USER":
optional_params["force_single_step"] = True
data = {
"model": model,
"chat_history": chat_history,
**optional_params,
}

View file

@ -80,18 +80,77 @@ class AsyncHTTPHandler:
json: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
timeout: Optional[Union[float, httpx.Timeout]] = None,
stream: bool = False,
):
try:
if timeout is None:
timeout = self.timeout
req = self.client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers # type: ignore
"POST", url, data=data, json=json, params=params, headers=headers, timeout=timeout # type: ignore
)
response = await self.client.send(req, stream=stream)
response.raise_for_status()
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=self.timeout, concurrent_limit=1)
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
try:
return await self.single_connection_post_request(
url=url,
client=new_client,
data=data,
json=json,
params=params,
headers=headers,
stream=stream,
)
finally:
await new_client.aclose()
except httpx.TimeoutException as e:
headers = {}
if hasattr(e, "response") and e.response is not None:
for key, value in e.response.headers.items():
headers["response_headers-{}".format(key)] = value
raise litellm.Timeout(
message=f"Connection timed out after {timeout} seconds.",
model="default-model-name",
llm_provider="litellm-httpx-handler",
headers=headers,
)
except httpx.HTTPStatusError as e:
setattr(e, "status_code", e.response.status_code)
if stream is True:
setattr(e, "message", await e.response.aread())
else:
setattr(e, "message", e.response.text)
raise e
except Exception as e:
raise e
async def delete(
self,
url: str,
data: Optional[Union[dict, str]] = None, # type: ignore
json: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
timeout: Optional[Union[float, httpx.Timeout]] = None,
stream: bool = False,
):
try:
if timeout is None:
timeout = self.timeout
req = self.client.build_request(
"DELETE", url, data=data, json=json, params=params, headers=headers, timeout=timeout # type: ignore
)
response = await self.client.send(req, stream=stream)
response.raise_for_status()
return response
except (httpx.RemoteProtocolError, httpx.ConnectError):
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=timeout, concurrent_limit=1)
try:
return await self.single_connection_post_request(
url=url,
@ -192,13 +251,28 @@ class HTTPHandler:
params: Optional[dict] = None,
headers: Optional[dict] = None,
stream: bool = False,
timeout: Optional[Union[float, httpx.Timeout]] = None,
):
try:
req = self.client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers # type: ignore
)
response = self.client.send(req, stream=stream)
return response
if timeout is not None:
req = self.client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers, timeout=timeout # type: ignore
)
else:
req = self.client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers # type: ignore
)
response = self.client.send(req, stream=stream)
return response
except httpx.TimeoutException:
raise litellm.Timeout(
message=f"Connection timed out after {timeout} seconds.",
model="default-model-name",
llm_provider="litellm-httpx-handler",
)
except Exception as e:
raise e
def __del__(self) -> None:
try:
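
Both HTTP handlers now accept a per-request timeout that falls back to the client-level default, and a timed-out request is re-raised as a provider-agnostic Timeout error. A minimal sketch of that fallback with plain httpx and a local exception standing in for litellm.Timeout (the example URL is a placeholder):

import httpx


class ProviderTimeout(Exception):
    """Stand-in for litellm.Timeout in this sketch."""


class SimpleHandler:
    def __init__(self, timeout: float = 600.0):
        self.timeout = timeout
        self.client = httpx.Client(timeout=timeout)

    def post(self, url, json=None, timeout=None):
        # Per-call timeout wins; otherwise the handler default applies.
        if timeout is None:
            timeout = self.timeout
        try:
            req = self.client.build_request("POST", url, json=json, timeout=timeout)
            return self.client.send(req)
        except httpx.TimeoutException:
            raise ProviderTimeout(f"Connection timed out after {timeout} seconds.")


# handler = SimpleHandler()
# handler.post("https://example.com/v1/chat/completions", json={}, timeout=5.0)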

View file

@ -15,8 +15,14 @@ import requests # type: ignore
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.databricks import GenericStreamingChunk
from litellm.types.utils import ProviderField
from litellm.types.llms.openai import (
ChatCompletionDeltaChunk,
ChatCompletionResponseMessage,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import GenericStreamingChunk, ProviderField
from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage
from .base import BaseLLM
@ -114,71 +120,6 @@ class DatabricksConfig:
optional_params["stop"] = value
return optional_params
def _chunk_parser(self, chunk_data: str) -> GenericStreamingChunk:
try:
text = ""
is_finished = False
finish_reason = None
logprobs = None
usage = None
original_chunk = None # this is used for function/tool calling
chunk_data = chunk_data.replace("data:", "")
chunk_data = chunk_data.strip()
if len(chunk_data) == 0 or chunk_data == "[DONE]":
return {
"text": "",
"is_finished": is_finished,
"finish_reason": finish_reason,
}
chunk_data_dict = json.loads(chunk_data)
str_line = litellm.ModelResponse(**chunk_data_dict, stream=True)
if len(str_line.choices) > 0:
if (
str_line.choices[0].delta is not None # type: ignore
and str_line.choices[0].delta.content is not None # type: ignore
):
text = str_line.choices[0].delta.content # type: ignore
else: # function/tool calling chunk - when content is None. in this case we just return the original chunk from openai
original_chunk = str_line
if str_line.choices[0].finish_reason:
is_finished = True
finish_reason = str_line.choices[0].finish_reason
if finish_reason == "content_filter":
if hasattr(str_line.choices[0], "content_filter_result"):
error_message = json.dumps(
str_line.choices[0].content_filter_result # type: ignore
)
else:
error_message = "Azure Response={}".format(
str(dict(str_line))
)
raise litellm.AzureOpenAIError(
status_code=400, message=error_message
)
# checking for logprobs
if (
hasattr(str_line.choices[0], "logprobs")
and str_line.choices[0].logprobs is not None
):
logprobs = str_line.choices[0].logprobs
else:
logprobs = None
usage = getattr(str_line, "usage", None)
return GenericStreamingChunk(
text=text,
is_finished=is_finished,
finish_reason=finish_reason,
logprobs=logprobs,
original_chunk=original_chunk,
usage=usage,
)
except Exception as e:
raise e
class DatabricksEmbeddingConfig:
"""
@ -236,7 +177,9 @@ async def make_call(
if response.status_code != 200:
raise DatabricksError(status_code=response.status_code, message=response.text)
completion_stream = response.aiter_lines()
completion_stream = ModelResponseIterator(
streaming_response=response.aiter_lines(), sync_stream=False
)
# LOGGING
logging_obj.post_call(
input=messages,
@ -248,6 +191,38 @@ async def make_call(
return completion_stream
def make_sync_call(
client: Optional[HTTPHandler],
api_base: str,
headers: dict,
data: str,
model: str,
messages: list,
logging_obj,
):
if client is None:
client = HTTPHandler() # Create a new client if none provided
response = client.post(api_base, headers=headers, data=data, stream=True)
if response.status_code != 200:
raise DatabricksError(status_code=response.status_code, message=response.read())
completion_stream = ModelResponseIterator(
streaming_response=response.iter_lines(), sync_stream=True
)
# LOGGING
logging_obj.post_call(
input=messages,
api_key="",
original_response="first stream response received",
additional_args={"complete_input_dict": data},
)
return completion_stream
class DatabricksChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
@ -259,6 +234,7 @@ class DatabricksChatCompletion(BaseLLM):
api_key: Optional[str],
api_base: Optional[str],
endpoint_type: Literal["chat_completions", "embeddings"],
custom_endpoint: Optional[bool],
) -> Tuple[str, dict]:
if api_key is None:
raise DatabricksError(
@ -277,97 +253,17 @@ class DatabricksChatCompletion(BaseLLM):
"Content-Type": "application/json",
}
if endpoint_type == "chat_completions":
if endpoint_type == "chat_completions" and custom_endpoint is not True:
api_base = "{}/chat/completions".format(api_base)
elif endpoint_type == "embeddings":
elif endpoint_type == "embeddings" and custom_endpoint is not True:
api_base = "{}/embeddings".format(api_base)
return api_base, headers
def process_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
messages: List,
print_verbose,
encoding,
) -> ModelResponse:
## LOGGING
logging_obj.post_call(
input=messages,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data},
)
print_verbose(f"raw model_response: {response.text}")
## RESPONSE OBJECT
try:
completion_response = response.json()
except:
raise DatabricksError(
message=response.text, status_code=response.status_code
)
if "error" in completion_response:
raise DatabricksError(
message=str(completion_response["error"]),
status_code=response.status_code,
)
else:
text_content = ""
tool_calls = []
for content in completion_response["content"]:
if content["type"] == "text":
text_content += content["text"]
## TOOL CALLING
elif content["type"] == "tool_use":
tool_calls.append(
{
"id": content["id"],
"type": "function",
"function": {
"name": content["name"],
"arguments": json.dumps(content["input"]),
},
}
)
_message = litellm.Message(
tool_calls=tool_calls,
content=text_content or None,
)
model_response.choices[0].message = _message # type: ignore
model_response._hidden_params["original_response"] = completion_response[
"content"
] # allow user to access raw anthropic tool calling response
model_response.choices[0].finish_reason = map_finish_reason(
completion_response["stop_reason"]
)
## CALCULATING USAGE
prompt_tokens = completion_response["usage"]["input_tokens"]
completion_tokens = completion_response["usage"]["output_tokens"]
total_tokens = prompt_tokens + completion_tokens
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
)
setattr(model_response, "usage", usage) # type: ignore
return model_response
async def acompletion_stream_function(
self,
model: str,
messages: list,
custom_llm_provider: str,
api_base: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
@ -397,7 +293,7 @@ class DatabricksChatCompletion(BaseLLM):
logging_obj=logging_obj,
),
model=model,
custom_llm_provider="databricks",
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
)
return streamwrapper
@ -415,6 +311,7 @@ class DatabricksChatCompletion(BaseLLM):
logging_obj,
stream,
data: dict,
base_model: Optional[str],
optional_params: dict,
litellm_params=None,
logger_fn=None,
@ -436,20 +333,25 @@ class DatabricksChatCompletion(BaseLLM):
except httpx.HTTPStatusError as e:
raise DatabricksError(
status_code=e.response.status_code,
message=response.text if response else str(e),
message=e.response.text,
)
except httpx.TimeoutException as e:
raise DatabricksError(status_code=408, message="Timeout error occurred.")
except Exception as e:
raise DatabricksError(status_code=500, message=str(e))
return ModelResponse(**response_json)
response = ModelResponse(**response_json)
if base_model is not None:
response._hidden_params["model"] = base_model
return response
def completion(
self,
model: str,
messages: list,
api_base: str,
custom_llm_provider: str,
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
@ -464,8 +366,13 @@ class DatabricksChatCompletion(BaseLLM):
timeout: Optional[Union[float, httpx.Timeout]] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
custom_endpoint: Optional[bool] = optional_params.pop("custom_endpoint", None)
base_model: Optional[str] = optional_params.pop("base_model", None)
api_base, headers = self._validate_environment(
api_base=api_base, api_key=api_key, endpoint_type="chat_completions"
api_base=api_base,
api_key=api_key,
endpoint_type="chat_completions",
custom_endpoint=custom_endpoint,
)
## Load Config
config = litellm.DatabricksConfig().get_config()
@ -475,7 +382,8 @@ class DatabricksChatCompletion(BaseLLM):
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
stream = optional_params.pop("stream", None)
stream: bool = optional_params.pop("stream", None) or False
optional_params["stream"] = stream
data = {
"model": model,
@ -493,11 +401,11 @@ class DatabricksChatCompletion(BaseLLM):
"headers": headers,
},
)
if acompletion == True:
if acompletion is True:
if client is not None and isinstance(client, HTTPHandler):
client = None
if (
stream is not None and stream == True
stream is not None and stream is True
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes async anthropic streaming POST request")
data["stream"] = stream
@ -518,6 +426,7 @@ class DatabricksChatCompletion(BaseLLM):
logger_fn=logger_fn,
headers=headers,
client=client,
custom_llm_provider=custom_llm_provider,
)
else:
return self.acompletion_function(
@ -537,46 +446,32 @@ class DatabricksChatCompletion(BaseLLM):
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
base_model=base_model,
)
else:
if client is None or isinstance(client, AsyncHTTPHandler):
self.client = HTTPHandler(timeout=timeout) # type: ignore
else:
self.client = client
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(timeout=timeout) # type: ignore
## COMPLETION CALL
if (
stream is not None and stream == True
): # if function call - fake the streaming (need complete blocks for output parsing in openai format)
print_verbose("makes dbrx streaming POST request")
data["stream"] = stream
try:
response = self.client.post(
api_base, headers=headers, data=json.dumps(data), stream=stream
)
response.raise_for_status()
completion_stream = response.iter_lines()
except httpx.HTTPStatusError as e:
raise DatabricksError(
status_code=e.response.status_code, message=response.text
)
except httpx.TimeoutException as e:
raise DatabricksError(
status_code=408, message="Timeout error occurred."
)
except Exception as e:
raise DatabricksError(status_code=408, message=str(e))
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
if stream is True:
return CustomStreamWrapper(
completion_stream=None,
make_call=partial(
make_sync_call,
client=None,
api_base=api_base,
headers=headers, # type: ignore
data=json.dumps(data),
model=model,
messages=messages,
logging_obj=logging_obj,
),
model=model,
custom_llm_provider="databricks",
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
)
return streaming_response
else:
try:
response = self.client.post(
response = client.post(
api_base, headers=headers, data=json.dumps(data)
)
response.raise_for_status()
@ -593,7 +488,12 @@ class DatabricksChatCompletion(BaseLLM):
except Exception as e:
raise DatabricksError(status_code=500, message=str(e))
return ModelResponse(**response_json)
response = ModelResponse(**response_json)
if base_model is not None:
response._hidden_params["model"] = base_model
return response
async def aembedding(
self,
@ -667,7 +567,10 @@ class DatabricksChatCompletion(BaseLLM):
aembedding=None,
) -> EmbeddingResponse:
api_base, headers = self._validate_environment(
api_base=api_base, api_key=api_key, endpoint_type="embeddings"
api_base=api_base,
api_key=api_key,
endpoint_type="embeddings",
custom_endpoint=False,
)
model = model
data = {"model": model, "input": input, **optional_params}
@ -716,3 +619,128 @@ class DatabricksChatCompletion(BaseLLM):
)
return litellm.EmbeddingResponse(**response_json)
class ModelResponseIterator:
def __init__(self, streaming_response, sync_stream: bool):
self.streaming_response = streaming_response
def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
try:
processed_chunk = litellm.ModelResponse(**chunk, stream=True) # type: ignore
text = ""
tool_use: Optional[ChatCompletionToolCallChunk] = None
is_finished = False
finish_reason = ""
usage: Optional[ChatCompletionUsageBlock] = None
if processed_chunk.choices[0].delta.content is not None: # type: ignore
text = processed_chunk.choices[0].delta.content # type: ignore
if (
processed_chunk.choices[0].delta.tool_calls is not None # type: ignore
and len(processed_chunk.choices[0].delta.tool_calls) > 0 # type: ignore
and processed_chunk.choices[0].delta.tool_calls[0].function is not None # type: ignore
and processed_chunk.choices[0].delta.tool_calls[0].function.arguments # type: ignore
is not None
):
tool_use = ChatCompletionToolCallChunk(
id=processed_chunk.choices[0].delta.tool_calls[0].id, # type: ignore
type="function",
function=ChatCompletionToolCallFunctionChunk(
name=processed_chunk.choices[0]
.delta.tool_calls[0] # type: ignore
.function.name,
arguments=processed_chunk.choices[0]
.delta.tool_calls[0] # type: ignore
.function.arguments,
),
index=processed_chunk.choices[0].index,
)
if processed_chunk.choices[0].finish_reason is not None:
is_finished = True
finish_reason = processed_chunk.choices[0].finish_reason
if hasattr(processed_chunk, "usage"):
usage = processed_chunk.usage # type: ignore
return GenericStreamingChunk(
text=text,
tool_use=tool_use,
is_finished=is_finished,
finish_reason=finish_reason,
usage=usage,
index=0,
)
except json.JSONDecodeError:
raise ValueError(f"Failed to decode JSON from chunk: {chunk}")
# Sync iterator
def __iter__(self):
self.response_iterator = self.streaming_response
return self
def __next__(self):
try:
chunk = self.response_iterator.__next__()
except StopIteration:
raise StopIteration
except ValueError as e:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
chunk = chunk.replace("data:", "")
chunk = chunk.strip()
if len(chunk) > 0:
json_chunk = json.loads(chunk)
return self.chunk_parser(chunk=json_chunk)
else:
return GenericStreamingChunk(
text="",
is_finished=False,
finish_reason="",
usage=None,
index=0,
tool_use=None,
)
except StopIteration:
raise StopIteration
except ValueError as e:
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
# Async iterator
def __aiter__(self):
self.async_response_iterator = self.streaming_response.__aiter__()
return self
async def __anext__(self):
try:
chunk = await self.async_response_iterator.__anext__()
except StopAsyncIteration:
raise StopAsyncIteration
except ValueError as e:
raise RuntimeError(f"Error receiving chunk from stream: {e}")
try:
chunk = chunk.replace("data:", "")
chunk = chunk.strip()
if chunk == "[DONE]":
raise StopAsyncIteration
if len(chunk) > 0:
json_chunk = json.loads(chunk)
return self.chunk_parser(chunk=json_chunk)
else:
return GenericStreamingChunk(
text="",
is_finished=False,
finish_reason="",
usage=None,
index=0,
tool_use=None,
)
except StopAsyncIteration:
raise StopAsyncIteration
except ValueError as e:
raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")

View file

@ -0,0 +1,315 @@
from typing import Any, Coroutine, Dict, List, Optional, Union
import httpx
from openai import AsyncAzureOpenAI, AzureOpenAI
from openai.types.file_deleted import FileDeleted
import litellm
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.types.llms.openai import *
def get_azure_openai_client(
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
api_version: Optional[str] = None,
organization: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
_is_async: bool = False,
) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]:
received_args = locals()
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None
if client is None:
data = {}
for k, v in received_args.items():
if k == "self" or k == "client" or k == "_is_async":
pass
elif k == "api_base" and v is not None:
data["azure_endpoint"] = v
elif v is not None:
data[k] = v
if "api_version" not in data:
data["api_version"] = litellm.AZURE_DEFAULT_API_VERSION
if _is_async is True:
openai_client = AsyncAzureOpenAI(**data)
else:
openai_client = AzureOpenAI(**data) # type: ignore
else:
openai_client = client
return openai_client
class AzureOpenAIFilesAPI(BaseLLM):
"""
AzureOpenAI methods to support files:
- create_file()
- retrieve_file()
- list_files()
- delete_file()
- file_content()
- update_file()
"""
def __init__(self) -> None:
super().__init__()
async def acreate_file(
self,
create_file_data: CreateFileRequest,
openai_client: AsyncAzureOpenAI,
) -> FileObject:
verbose_logger.debug("create_file_data=%s", create_file_data)
response = await openai_client.files.create(**create_file_data)
verbose_logger.debug("create_file_response=%s", response)
return response
def create_file(
self,
_is_async: bool,
create_file_data: CreateFileRequest,
api_base: str,
api_key: Optional[str],
api_version: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
api_version=api_version,
timeout=timeout,
max_retries=max_retries,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.acreate_file( # type: ignore
create_file_data=create_file_data, openai_client=openai_client
)
response = openai_client.files.create(**create_file_data)
return response
async def afile_content(
self,
file_content_request: FileContentRequest,
openai_client: AsyncAzureOpenAI,
) -> HttpxBinaryResponseContent:
response = await openai_client.files.content(**file_content_request)
return response
def file_content(
self,
_is_async: bool,
file_content_request: FileContentRequest,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
) -> Union[
HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]
]:
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
api_version=api_version,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.afile_content( # type: ignore
file_content_request=file_content_request,
openai_client=openai_client,
)
response = openai_client.files.content(**file_content_request)
return response
async def aretrieve_file(
self,
file_id: str,
openai_client: AsyncAzureOpenAI,
) -> FileObject:
response = await openai_client.files.retrieve(file_id=file_id)
return response
def retrieve_file(
self,
_is_async: bool,
file_id: str,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
api_version=api_version,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.aretrieve_file( # type: ignore
file_id=file_id,
openai_client=openai_client,
)
response = openai_client.files.retrieve(file_id=file_id)
return response
async def adelete_file(
self,
file_id: str,
openai_client: AsyncAzureOpenAI,
) -> FileDeleted:
response = await openai_client.files.delete(file_id=file_id)
return response
def delete_file(
self,
_is_async: bool,
file_id: str,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str] = None,
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
api_version=api_version,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.adelete_file( # type: ignore
file_id=file_id,
openai_client=openai_client,
)
response = openai_client.files.delete(file_id=file_id)
return response
async def alist_files(
self,
openai_client: AsyncAzureOpenAI,
purpose: Optional[str] = None,
):
if isinstance(purpose, str):
response = await openai_client.files.list(purpose=purpose)
else:
response = await openai_client.files.list()
return response
def list_files(
self,
_is_async: bool,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
purpose: Optional[str] = None,
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
api_version=api_version,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.alist_files( # type: ignore
purpose=purpose,
openai_client=openai_client,
)
if isinstance(purpose, str):
response = openai_client.files.list(purpose=purpose)
else:
response = openai_client.files.list()
return response
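A usage sketch for the files class above, driven synchronously. The endpoint, key, and file name are placeholders; the import path appears to match the one referenced by the Azure fine-tuning module later in this diff, but is not confirmed here.

from litellm.llms.files_apis.azure import AzureOpenAIFilesAPI

files_api = AzureOpenAIFilesAPI()
created = files_api.create_file(
    _is_async=False,                    # sync path -> an AzureOpenAI client is built
    create_file_data={
        "file": open("training_data.jsonl", "rb"),   # placeholder file
        "purpose": "fine-tune",
    },
    api_base="https://my-resource.openai.azure.com",  # placeholder endpoint
    api_key="my-azure-key",                           # placeholder key
    api_version=None,     # falls back to litellm.AZURE_DEFAULT_API_VERSION
    timeout=600.0,
    max_retries=2,
)
print(created.id)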

View file

@ -0,0 +1,181 @@
from typing import Any, Coroutine, Optional, Union
import httpx
from openai import AsyncAzureOpenAI, AzureOpenAI
from openai.pagination import AsyncCursorPage
from openai.types.fine_tuning import FineTuningJob
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.llms.files_apis.azure import get_azure_openai_client
from litellm.types.llms.openai import FineTuningJobCreate
class AzureOpenAIFineTuningAPI(BaseLLM):
"""
AzureOpenAI methods to support fine-tuning jobs
"""
def __init__(self) -> None:
super().__init__()
async def acreate_fine_tuning_job(
self,
create_fine_tuning_job_data: dict,
openai_client: AsyncAzureOpenAI,
) -> FineTuningJob:
response = await openai_client.fine_tuning.jobs.create(
**create_fine_tuning_job_data # type: ignore
)
return response
def create_fine_tuning_job(
self,
_is_async: bool,
create_fine_tuning_job_data: dict,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
api_version: Optional[str] = None,
) -> Union[FineTuningJob, Union[Coroutine[Any, Any, FineTuningJob]]]:
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
api_version=api_version,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.acreate_fine_tuning_job( # type: ignore
create_fine_tuning_job_data=create_fine_tuning_job_data,
openai_client=openai_client,
)
verbose_logger.debug(
"creating fine tuning job, args= %s", create_fine_tuning_job_data
)
response = openai_client.fine_tuning.jobs.create(**create_fine_tuning_job_data) # type: ignore
return response
async def acancel_fine_tuning_job(
self,
fine_tuning_job_id: str,
openai_client: AsyncAzureOpenAI,
) -> FineTuningJob:
response = await openai_client.fine_tuning.jobs.cancel(
fine_tuning_job_id=fine_tuning_job_id
)
return response
def cancel_fine_tuning_job(
self,
_is_async: bool,
fine_tuning_job_id: str,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str] = None,
api_version: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
api_version=api_version,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.acancel_fine_tuning_job( # type: ignore
fine_tuning_job_id=fine_tuning_job_id,
openai_client=openai_client,
)
verbose_logger.debug("canceling fine tuning job, args= %s", fine_tuning_job_id)
response = openai_client.fine_tuning.jobs.cancel(
fine_tuning_job_id=fine_tuning_job_id
)
return response
async def alist_fine_tuning_jobs(
self,
openai_client: AsyncAzureOpenAI,
after: Optional[str] = None,
limit: Optional[int] = None,
):
response = await openai_client.fine_tuning.jobs.list(after=after, limit=limit) # type: ignore
return response
def list_fine_tuning_jobs(
self,
_is_async: bool,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str] = None,
client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None,
api_version: Optional[str] = None,
after: Optional[str] = None,
limit: Optional[int] = None,
):
openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = (
get_azure_openai_client(
api_key=api_key,
api_base=api_base,
api_version=api_version,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
)
if openai_client is None:
raise ValueError(
"AzureOpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncAzureOpenAI):
raise ValueError(
"AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client."
)
return self.alist_fine_tuning_jobs( # type: ignore
after=after,
limit=limit,
openai_client=openai_client,
)
verbose_logger.debug("list fine tuning job, after= %s, limit= %s", after, limit)
response = openai_client.fine_tuning.jobs.list(after=after, limit=limit) # type: ignore
return response
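A corresponding async sketch for the Azure fine-tuning class above. The module path is assumed, and the key, endpoint, deployment name, and file id are placeholders; passing a prebuilt AsyncAzureOpenAI client makes get_azure_openai_client() return it unchanged.

import asyncio
from openai import AsyncAzureOpenAI
from litellm.llms.fine_tuning_apis.azure import AzureOpenAIFineTuningAPI  # assumed module path

async def main():
    client = AsyncAzureOpenAI(
        api_key="my-azure-key",                                   # placeholder
        azure_endpoint="https://my-resource.openai.azure.com",    # placeholder
        api_version="2024-05-01-preview",
    )
    ft_api = AzureOpenAIFineTuningAPI()
    # _is_async=True makes create_fine_tuning_job return the coroutine from acreate_fine_tuning_job
    job = await ft_api.create_fine_tuning_job(
        _is_async=True,
        create_fine_tuning_job_data={
            "model": "gpt-35-turbo",         # placeholder deployment
            "training_file": "file-abc123",  # placeholder file id
        },
        api_key=None,
        api_base=None,
        timeout=600.0,
        max_retries=2,
        client=client,
    )
    print(job.id, job.status)

asyncio.run(main())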

View file

@ -0,0 +1,199 @@
from typing import Any, Coroutine, Optional, Union
import httpx
from openai import AsyncOpenAI, OpenAI
from openai.pagination import AsyncCursorPage
from openai.types.fine_tuning import FineTuningJob
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.types.llms.openai import FineTuningJobCreate
class OpenAIFineTuningAPI(BaseLLM):
"""
OpenAI methods to support fine-tuning jobs
"""
def __init__(self) -> None:
super().__init__()
def get_openai_client(
self,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
_is_async: bool = False,
) -> Optional[Union[OpenAI, AsyncOpenAI]]:
received_args = locals()
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = None
if client is None:
data = {}
for k, v in received_args.items():
if k == "self" or k == "client" or k == "_is_async":
pass
elif k == "api_base" and v is not None:
data["base_url"] = v
elif v is not None:
data[k] = v
if _is_async is True:
openai_client = AsyncOpenAI(**data)
else:
openai_client = OpenAI(**data) # type: ignore
else:
openai_client = client
return openai_client
async def acreate_fine_tuning_job(
self,
create_fine_tuning_job_data: dict,
openai_client: AsyncOpenAI,
) -> FineTuningJob:
response = await openai_client.fine_tuning.jobs.create(
**create_fine_tuning_job_data
)
return response
def create_fine_tuning_job(
self,
_is_async: bool,
create_fine_tuning_job_data: dict,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
) -> Union[FineTuningJob, Union[Coroutine[Any, Any, FineTuningJob]]]:
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.acreate_fine_tuning_job( # type: ignore
create_fine_tuning_job_data=create_fine_tuning_job_data,
openai_client=openai_client,
)
verbose_logger.debug(
"creating fine tuning job, args= %s", create_fine_tuning_job_data
)
response = openai_client.fine_tuning.jobs.create(**create_fine_tuning_job_data)
return response
async def acancel_fine_tuning_job(
self,
fine_tuning_job_id: str,
openai_client: AsyncOpenAI,
) -> FineTuningJob:
response = await openai_client.fine_tuning.jobs.cancel(
fine_tuning_job_id=fine_tuning_job_id
)
return response
def cancel_fine_tuning_job(
self,
_is_async: bool,
fine_tuning_job_id: str,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.acancel_fine_tuning_job( # type: ignore
fine_tuning_job_id=fine_tuning_job_id,
openai_client=openai_client,
)
verbose_logger.debug("canceling fine tuning job, args= %s", fine_tuning_job_id)
response = openai_client.fine_tuning.jobs.cancel(
fine_tuning_job_id=fine_tuning_job_id
)
return response
async def alist_fine_tuning_jobs(
self,
openai_client: AsyncOpenAI,
after: Optional[str] = None,
limit: Optional[int] = None,
):
response = await openai_client.fine_tuning.jobs.list(after=after, limit=limit) # type: ignore
return response
def list_fine_tuning_jobs(
self,
_is_async: bool,
api_key: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
after: Optional[str] = None,
limit: Optional[int] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.alist_fine_tuning_jobs( # type: ignore
after=after,
limit=limit,
openai_client=openai_client,
)
verbose_logger.debug("list fine tuning job, after= %s, limit= %s", after, limit)
response = openai_client.fine_tuning.jobs.list(after=after, limit=limit) # type: ignore
return response
pass
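A standalone sketch (helper name hypothetical) of the kwarg mapping inside get_openai_client() above: every non-None argument except client/_is_async is forwarded to the OpenAI constructor, with api_base renamed to the SDK's base_url.

def build_openai_client_kwargs(**received_args) -> dict:
    data = {}
    for k, v in received_args.items():
        if k in ("client", "_is_async"):
            continue
        if k == "api_base" and v is not None:
            data["base_url"] = v      # the OpenAI SDK calls this base_url
        elif v is not None:
            data[k] = v               # drop None values so SDK defaults apply
    return data

print(build_openai_client_kwargs(
    api_key="sk-placeholder", api_base="https://example.com/v1",
    timeout=600.0, max_retries=None, organization=None,
))
# -> {'api_key': 'sk-placeholder', 'base_url': 'https://example.com/v1', 'timeout': 600.0}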

View file

@ -0,0 +1,298 @@
import traceback
from datetime import datetime
from typing import Any, Coroutine, Literal, Optional, Union
import httpx
from openai.types.fine_tuning.fine_tuning_job import FineTuningJob, Hyperparameters
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.vertex_httpx import VertexLLM
from litellm.types.llms.openai import FineTuningJobCreate
from litellm.types.llms.vertex_ai import (
FineTuneJobCreate,
FineTunesupervisedTuningSpec,
ResponseTuningJob,
)
class VertexFineTuningAPI(VertexLLM):
"""
Vertex AI methods to support fine-tuning jobs
"""
def __init__(self) -> None:
super().__init__()
self.async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
def convert_response_created_at(self, response: ResponseTuningJob):
try:
create_time_str = response.get("createTime", "") or ""
create_time_datetime = datetime.fromisoformat(
create_time_str.replace("Z", "+00:00")
)
# Convert to Unix timestamp (seconds since epoch)
created_at = int(create_time_datetime.timestamp())
return created_at
except Exception as e:
return 0
def convert_vertex_response_to_open_ai_response(
self, response: ResponseTuningJob
) -> FineTuningJob:
status: Literal[
"validating_files", "queued", "running", "succeeded", "failed", "cancelled"
] = "queued"
if response["state"] == "JOB_STATE_PENDING":
status = "queued"
if response["state"] == "JOB_STATE_SUCCEEDED":
status = "succeeded"
if response["state"] == "JOB_STATE_FAILED":
status = "failed"
if response["state"] == "JOB_STATE_CANCELLED":
status = "cancelled"
if response["state"] == "JOB_STATE_RUNNING":
status = "running"
created_at = self.convert_response_created_at(response)
training_uri = ""
if "supervisedTuningSpec" in response and response["supervisedTuningSpec"]:
training_uri = response["supervisedTuningSpec"]["trainingDatasetUri"] or ""
return FineTuningJob(
id=response["name"] or "",
created_at=created_at,
fine_tuned_model=response["tunedModelDisplayName"],
finished_at=None,
hyperparameters=Hyperparameters(
n_epochs=0,
),
model=response["baseModel"] or "",
object="fine_tuning.job",
organization_id="",
result_files=[],
seed=0,
status=status,
trained_tokens=None,
training_file=training_uri,
validation_file=None,
estimated_finish=None,
integrations=[],
)
def convert_openai_request_to_vertex(
self, create_fine_tuning_job_data: FineTuningJobCreate, **kwargs
) -> FineTuneJobCreate:
"""
convert request from OpenAI format to Vertex format
https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/tuning
"""
hyperparameters = create_fine_tuning_job_data.hyperparameters
supervised_tuning_spec = FineTunesupervisedTuningSpec(
training_dataset_uri=create_fine_tuning_job_data.training_file,
validation_dataset=create_fine_tuning_job_data.validation_file,
)
if hyperparameters:
if hyperparameters.n_epochs:
supervised_tuning_spec["epoch_count"] = int(hyperparameters.n_epochs)
if hyperparameters.learning_rate_multiplier:
supervised_tuning_spec["learning_rate_multiplier"] = float(
hyperparameters.learning_rate_multiplier
)
supervised_tuning_spec["adapter_size"] = kwargs.get("adapter_size")
fine_tune_job = FineTuneJobCreate(
baseModel=create_fine_tuning_job_data.model,
supervisedTuningSpec=supervised_tuning_spec,
tunedModelDisplayName=create_fine_tuning_job_data.suffix,
)
return fine_tune_job
async def acreate_fine_tuning_job(
self,
fine_tuning_url: str,
headers: dict,
request_data: FineTuneJobCreate,
):
from litellm.fine_tuning.main import FineTuningJob
try:
verbose_logger.debug(
"about to create fine tuning job: %s, request_data: %s",
fine_tuning_url,
request_data,
)
if self.async_handler is None:
raise ValueError(
"VertexAI Fine Tuning - async_handler is not initialized"
)
response = await self.async_handler.post(
headers=headers,
url=fine_tuning_url,
json=request_data, # type: ignore
)
if response.status_code != 200:
raise Exception(
f"Error creating fine tuning job. Status code: {response.status_code}. Response: {response.text}"
)
verbose_logger.debug(
"got response from creating fine tuning job: %s", response.json()
)
vertex_response = ResponseTuningJob( # type: ignore
**response.json(),
)
verbose_logger.debug("vertex_response %s", vertex_response)
open_ai_response = self.convert_vertex_response_to_open_ai_response(
vertex_response
)
return open_ai_response
except Exception as e:
verbose_logger.error("asyncerror creating fine tuning job %s", e)
trace_back_str = traceback.format_exc()
verbose_logger.error(trace_back_str)
raise e
def create_fine_tuning_job(
self,
_is_async: bool,
create_fine_tuning_job_data: FineTuningJobCreate,
vertex_project: Optional[str],
vertex_location: Optional[str],
vertex_credentials: Optional[str],
api_base: Optional[str],
timeout: Union[float, httpx.Timeout],
**kwargs,
):
verbose_logger.debug(
"creating fine tuning job, args= %s", create_fine_tuning_job_data
)
auth_header, _ = self._get_token_and_url(
model="",
gemini_api_key=None,
vertex_credentials=vertex_credentials,
vertex_project=vertex_project,
vertex_location=vertex_location,
stream=False,
custom_llm_provider="vertex_ai_beta",
api_base=api_base,
)
headers = {
"Authorization": f"Bearer {auth_header}",
"Content-Type": "application/json",
}
fine_tune_job = self.convert_openai_request_to_vertex(
create_fine_tuning_job_data=create_fine_tuning_job_data, **kwargs
)
fine_tuning_url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/tuningJobs"
if _is_async is True:
return self.acreate_fine_tuning_job( # type: ignore
fine_tuning_url=fine_tuning_url,
headers=headers,
request_data=fine_tune_job,
)
sync_handler = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
verbose_logger.debug(
"about to create fine tuning job: %s, request_data: %s",
fine_tuning_url,
fine_tune_job,
)
response = sync_handler.post(
headers=headers,
url=fine_tuning_url,
json=fine_tune_job, # type: ignore
)
if response.status_code != 200:
raise Exception(
f"Error creating fine tuning job. Status code: {response.status_code}. Response: {response.text}"
)
verbose_logger.debug(
"got response from creating fine tuning job: %s", response.json()
)
vertex_response = ResponseTuningJob( # type: ignore
**response.json(),
)
verbose_logger.debug("vertex_response %s", vertex_response)
open_ai_response = self.convert_vertex_response_to_open_ai_response(
vertex_response
)
return open_ai_response
async def pass_through_vertex_ai_POST_request(
self,
request_data: dict,
vertex_project: str,
vertex_location: str,
vertex_credentials: str,
request_route: str,
):
auth_header, _ = self._get_token_and_url(
model="",
gemini_api_key=None,
vertex_credentials=vertex_credentials,
vertex_project=vertex_project,
vertex_location=vertex_location,
stream=False,
custom_llm_provider="vertex_ai_beta",
api_base="",
)
headers = {
"Authorization": f"Bearer {auth_header}",
"Content-Type": "application/json",
}
url = None
if request_route == "/tuningJobs":
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/tuningJobs"
elif "/tuningJobs/" in request_route and "cancel" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/tuningJobs{request_route}"
elif "generateContent" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}{request_route}"
elif "predict" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}{request_route}"
elif "/batchPredictionJobs" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}{request_route}"
elif "countTokens" in request_route:
url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}{request_route}"
else:
raise ValueError(f"Unsupported Vertex AI request route: {request_route}")
if self.async_handler is None:
raise ValueError("VertexAI Fine Tuning - async_handler is not initialized")
response = await self.async_handler.post(
headers=headers,
url=url,
json=request_data, # type: ignore
)
if response.status_code != 200:
raise Exception(
f"Error creating fine tuning job. Status code: {response.status_code}. Response: {response.text}"
)
response_json = response.json()
return response_json
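A minimal standalone sketch (function name hypothetical, plain dicts in place of the typed request objects) of the OpenAI-to-Vertex payload mapping performed by convert_openai_request_to_vertex() above.

def openai_to_vertex_tuning_payload(openai_request: dict) -> dict:
    supervised_tuning_spec = {
        "training_dataset_uri": openai_request["training_file"],
        "validation_dataset": openai_request.get("validation_file"),
    }
    hp = openai_request.get("hyperparameters") or {}
    if hp.get("n_epochs"):
        supervised_tuning_spec["epoch_count"] = int(hp["n_epochs"])
    if hp.get("learning_rate_multiplier"):
        supervised_tuning_spec["learning_rate_multiplier"] = float(hp["learning_rate_multiplier"])
    return {
        "baseModel": openai_request["model"],
        "supervisedTuningSpec": supervised_tuning_spec,
        "tunedModelDisplayName": openai_request.get("suffix"),
    }

print(openai_to_vertex_tuning_payload({
    "model": "gemini-1.0-pro-002",
    "training_file": "gs://my-bucket/train.jsonl",   # placeholder GCS URI
    "hyperparameters": {"n_epochs": 3},
}))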

View file

@ -6,12 +6,13 @@ import os
import time
import types
from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, get_args
import httpx
import requests
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.completion import ChatCompletionMessageToolCallParam
from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage
@ -60,6 +61,10 @@ hf_tasks = Literal[
"text-generation",
]
hf_tasks_embeddings = Literal[ # pipeline tags + hf tei endpoints - https://huggingface.github.io/text-embeddings-inference/#/
"sentence-similarity", "feature-extraction", "rerank", "embed", "similarity"
]
class HuggingfaceConfig:
"""
@ -249,6 +254,55 @@ def get_hf_task_for_model(model: str) -> Tuple[hf_tasks, str]:
return "text-generation-inference", model # default to tgi
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
def get_hf_task_embedding_for_model(
model: str, task_type: Optional[str], api_base: str
) -> Optional[str]:
if task_type is not None:
if task_type in get_args(hf_tasks_embeddings):
return task_type
else:
raise Exception(
"Invalid task_type={}. Expected one of={}".format(
task_type, hf_tasks_embeddings
)
)
http_client = HTTPHandler(concurrent_limit=1)
model_info = http_client.get(url=api_base)
model_info_dict = model_info.json()
pipeline_tag: Optional[str] = model_info_dict.get("pipeline_tag", None)
return pipeline_tag
async def async_get_hf_task_embedding_for_model(
model: str, task_type: Optional[str], api_base: str
) -> Optional[str]:
if task_type is not None:
if task_type in get_args(hf_tasks_embeddings):
return task_type
else:
raise Exception(
"Invalid task_type={}. Expected one of={}".format(
task_type, hf_tasks_embeddings
)
)
http_client = AsyncHTTPHandler(concurrent_limit=1)
model_info = await http_client.get(url=api_base)
model_info_dict = model_info.json()
pipeline_tag: Optional[str] = model_info_dict.get("pipeline_tag", None)
return pipeline_tag
class Huggingface(BaseLLM):
_client_session: Optional[httpx.Client] = None
_aclient_session: Optional[httpx.AsyncClient] = None
@ -256,7 +310,7 @@ class Huggingface(BaseLLM):
def __init__(self) -> None:
super().__init__()
def validate_environment(self, api_key, headers):
def _validate_environment(self, api_key, headers) -> dict:
default_headers = {
"content-type": "application/json",
}
@ -406,7 +460,7 @@ class Huggingface(BaseLLM):
super().completion()
exception_mapping_worked = False
try:
headers = self.validate_environment(api_key, headers)
headers = self._validate_environment(api_key, headers)
task, model = get_hf_task_for_model(model)
## VALIDATE API FORMAT
if task is None or not isinstance(task, str) or task not in hf_task_list:
@ -762,76 +816,82 @@ class Huggingface(BaseLLM):
async for transformed_chunk in streamwrapper:
yield transformed_chunk
def embedding(
self,
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
logging_obj=None,
encoding=None,
):
super().embedding()
headers = self.validate_environment(api_key, headers=None)
# print_verbose(f"{model}, {task}")
embed_url = ""
if "https" in model:
embed_url = model
elif api_base:
embed_url = api_base
elif "HF_API_BASE" in os.environ:
embed_url = os.getenv("HF_API_BASE", "")
elif "HUGGINGFACE_API_BASE" in os.environ:
embed_url = os.getenv("HUGGINGFACE_API_BASE", "")
else:
embed_url = f"https://api-inference.huggingface.co/models/{model}"
def _transform_input_on_pipeline_tag(
self, input: List, pipeline_tag: Optional[str]
) -> dict:
if pipeline_tag is None:
return {"inputs": input}
if pipeline_tag == "sentence-similarity" or pipeline_tag == "similarity":
if len(input) < 2:
raise HuggingfaceError(
status_code=400,
message="sentence-similarity requires 2+ sentences",
)
return {"inputs": {"source_sentence": input[0], "sentences": input[1:]}}
elif pipeline_tag == "rerank":
if len(input) < 2:
raise HuggingfaceError(
status_code=400,
message="reranker requires 2+ sentences",
)
return {"inputs": {"query": input[0], "texts": input[1:]}}
return {"inputs": input} # default to feature-extraction pipeline tag
async def _async_transform_input(
self, model: str, task_type: Optional[str], embed_url: str, input: List
) -> dict:
hf_task = await async_get_hf_task_embedding_for_model(
model=model, task_type=task_type, api_base=embed_url
)
data = self._transform_input_on_pipeline_tag(input=input, pipeline_tag=hf_task)
return data
def _transform_input(
self,
input: List,
model: str,
call_type: Literal["sync", "async"],
optional_params: dict,
embed_url: str,
) -> dict:
## TRANSFORMATION ##
if "sentence-transformers" in model:
if len(input) == 0:
raise HuggingfaceError(
status_code=400,
message="sentence transformers requires 2+ sentences",
)
data = {
"inputs": {
"source_sentence": input[0],
"sentences": [
"That is a happy dog",
"That is a very happy person",
"Today is a sunny day",
],
}
}
data = {"inputs": {"source_sentence": input[0], "sentences": input[1:]}}
else:
data = {"inputs": input} # type: ignore
## LOGGING
logging_obj.pre_call(
input=input,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": embed_url,
},
)
## COMPLETION CALL
response = requests.post(embed_url, headers=headers, data=json.dumps(data))
task_type = optional_params.pop("input_type", None)
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
if call_type == "sync":
hf_task = get_hf_task_embedding_for_model(
model=model, task_type=task_type, api_base=embed_url
)
elif call_type == "async":
return self._async_transform_input(
model=model, task_type=task_type, embed_url=embed_url, input=input
) # type: ignore
embeddings = response.json()
data = self._transform_input_on_pipeline_tag(
input=input, pipeline_tag=hf_task
)
if "error" in embeddings:
raise HuggingfaceError(status_code=500, message=embeddings["error"])
return data
def _process_embedding_response(
self,
embeddings: dict,
model_response: litellm.EmbeddingResponse,
model: str,
input: List,
encoding: Any,
) -> litellm.EmbeddingResponse:
output_data = []
if "similarities" in embeddings:
for idx, embedding in embeddings["similarities"]:
@ -888,3 +948,156 @@ class Huggingface(BaseLLM):
),
)
return model_response
async def aembedding(
self,
model: str,
input: list,
model_response: litellm.utils.EmbeddingResponse,
timeout: Union[float, httpx.Timeout],
logging_obj: LiteLLMLoggingObj,
optional_params: dict,
api_base: str,
api_key: Optional[str],
headers: dict,
encoding: Callable,
client: Optional[AsyncHTTPHandler] = None,
):
## TRANSFORMATION ##
data = self._transform_input(
input=input,
model=model,
call_type="sync",
optional_params=optional_params,
embed_url=api_base,
)
## LOGGING
logging_obj.pre_call(
input=input,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": api_base,
},
)
## COMPLETION CALL
if client is None:
client = AsyncHTTPHandler(concurrent_limit=1)
response = await client.post(api_base, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
embeddings = response.json()
if "error" in embeddings:
raise HuggingfaceError(status_code=500, message=embeddings["error"])
## PROCESS RESPONSE ##
return self._process_embedding_response(
embeddings=embeddings,
model_response=model_response,
model=model,
input=input,
encoding=encoding,
)
def embedding(
self,
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
logging_obj: LiteLLMLoggingObj,
encoding: Callable,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
aembedding: Optional[bool] = None,
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
) -> litellm.EmbeddingResponse:
super().embedding()
headers = self._validate_environment(api_key, headers=None)
# print_verbose(f"{model}, {task}")
embed_url = ""
if "https" in model:
embed_url = model
elif api_base:
embed_url = api_base
elif "HF_API_BASE" in os.environ:
embed_url = os.getenv("HF_API_BASE", "")
elif "HUGGINGFACE_API_BASE" in os.environ:
embed_url = os.getenv("HUGGINGFACE_API_BASE", "")
else:
embed_url = f"https://api-inference.huggingface.co/models/{model}"
## ROUTING ##
if aembedding is True:
return self.aembedding(
input=input,
model_response=model_response,
timeout=timeout,
logging_obj=logging_obj,
headers=headers,
api_base=embed_url, # type: ignore
api_key=api_key,
client=client if isinstance(client, AsyncHTTPHandler) else None,
model=model,
optional_params=optional_params,
encoding=encoding,
)
## TRANSFORMATION ##
data = self._transform_input(
input=input,
model=model,
call_type="sync",
optional_params=optional_params,
embed_url=embed_url,
)
## LOGGING
logging_obj.pre_call(
input=input,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": embed_url,
},
)
## COMPLETION CALL
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(concurrent_limit=1)
response = client.post(embed_url, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=input,
api_key=api_key,
additional_args={"complete_input_dict": data},
original_response=response,
)
embeddings = response.json()
if "error" in embeddings:
raise HuggingfaceError(status_code=500, message=embeddings["error"])
## PROCESS RESPONSE ##
return self._process_embedding_response(
embeddings=embeddings,
model_response=model_response,
model=model,
input=input,
encoding=encoding,
)
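A standalone sketch (helper name hypothetical) of the request payload shapes produced by _transform_input_on_pipeline_tag() above for each supported embedding pipeline tag.

from typing import List, Optional

def hf_embedding_payload(inputs: List[str], pipeline_tag: Optional[str]) -> dict:
    if pipeline_tag in ("sentence-similarity", "similarity"):
        if len(inputs) < 2:
            raise ValueError("sentence-similarity requires 2+ sentences")
        return {"inputs": {"source_sentence": inputs[0], "sentences": inputs[1:]}}
    if pipeline_tag == "rerank":
        if len(inputs) < 2:
            raise ValueError("reranker requires 2+ sentences")
        return {"inputs": {"query": inputs[0], "texts": inputs[1:]}}
    return {"inputs": inputs}  # feature-extraction / embed / unknown tag

print(hf_embedding_payload(["hello world"], None))
print(hf_embedding_payload(["happy dog", "sunny day"], "sentence-similarity"))
print(hf_embedding_payload(["query text", "candidate a", "candidate b"], "rerank"))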

View file

@ -258,7 +258,7 @@ def get_ollama_response(
logging_obj=logging_obj,
)
return response
elif stream == True:
elif stream is True:
return ollama_completion_stream(url=url, data=data, logging_obj=logging_obj)
response = requests.post(
@ -326,7 +326,7 @@ def ollama_completion_stream(url, data, logging_obj):
try:
if response.status_code != 200:
raise OllamaError(
status_code=response.status_code, message=response.text
status_code=response.status_code, message=response.read()
)
streamwrapper = litellm.CustomStreamWrapper(

View file

@ -149,7 +149,9 @@ class OllamaChatConfig:
"response_format",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
def map_openai_params(
self, model: str, non_default_params: dict, optional_params: dict
):
for param, value in non_default_params.items():
if param == "max_tokens":
optional_params["num_predict"] = value
@ -170,16 +172,26 @@ class OllamaChatConfig:
### FUNCTION CALLING LOGIC ###
if param == "tools":
# ollama actually supports json output
optional_params["format"] = "json"
litellm.add_function_to_prompt = (
True # so that main.py adds the function call to the prompt
)
optional_params["functions_unsupported_model"] = value
## CHECK IF MODEL SUPPORTS TOOL CALLING ##
try:
model_info = litellm.get_model_info(
model=model, custom_llm_provider="ollama_chat"
)
if model_info.get("supports_function_calling") is True:
optional_params["tools"] = value
else:
raise Exception
except Exception:
optional_params["format"] = "json"
litellm.add_function_to_prompt = (
True # so that main.py adds the function call to the prompt
)
optional_params["functions_unsupported_model"] = value
if len(optional_params["functions_unsupported_model"]) == 1:
optional_params["function_name"] = optional_params[
"functions_unsupported_model"
][0]["function"]["name"]
if len(optional_params["functions_unsupported_model"]) == 1:
optional_params["function_name"] = optional_params[
"functions_unsupported_model"
][0]["function"]["name"]
if param == "functions":
# ollama actually supports json output
@ -198,11 +210,11 @@ class OllamaChatConfig:
# ollama implementation
def get_ollama_response(
model_response: litellm.ModelResponse,
messages: list,
optional_params: dict,
api_base="http://localhost:11434",
api_key: Optional[str] = None,
model="llama2",
messages=None,
optional_params=None,
logging_obj=None,
acompletion: bool = False,
encoding=None,
@ -223,6 +235,7 @@ def get_ollama_response(
stream = optional_params.pop("stream", False)
format = optional_params.pop("format", None)
function_name = optional_params.pop("function_name", None)
tools = optional_params.pop("tools", None)
for m in messages:
if "role" in m and m["role"] == "tool":
@ -236,6 +249,8 @@ def get_ollama_response(
}
if format is not None:
data["format"] = format
if tools is not None:
data["tools"] = tools
## LOGGING
logging_obj.pre_call(
input=None,
@ -278,7 +293,7 @@ def get_ollama_response(
"json": data,
}
if api_key is not None:
_request["headers"] = "Bearer {}".format(api_key)
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
response = requests.post(**_request) # type: ignore
if response.status_code != 200:
raise OllamaError(status_code=response.status_code, message=response.text)
@ -343,7 +358,7 @@ def ollama_completion_stream(url, api_key, data, logging_obj):
"timeout": litellm.request_timeout,
}
if api_key is not None:
_request["headers"] = "Bearer {}".format(api_key)
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
with httpx.stream(**_request) as response:
try:
if response.status_code != 200:
@ -405,7 +420,7 @@ async def ollama_async_streaming(
"timeout": litellm.request_timeout,
}
if api_key is not None:
_request["headers"] = "Bearer {}".format(api_key)
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
async with client.stream(**_request) as response:
if response.status_code != 200:
raise OllamaError(
@ -477,7 +492,7 @@ async def ollama_acompletion(
"json": data,
}
if api_key is not None:
_request["headers"] = "Bearer {}".format(api_key)
_request["headers"] = {"Authorization": "Bearer {}".format(api_key)}
resp = await session.post(**_request)
if resp.status != 200:
@ -499,7 +514,8 @@ async def ollama_acompletion(
## RESPONSE OBJECT
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
if data.get("format", "") == "json" and function_name is not None:
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message(
content=None,
@ -519,11 +535,8 @@ async def ollama_acompletion(
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response.choices[0].message.content = response_json[ # type: ignore
"message"
][
"content"
]
_message = litellm.Message(**response_json["message"])
model_response.choices[0].message = _message # type: ignore
model_response.created = int(time.time())
model_response.model = "ollama_chat/" + data["model"]

Some files were not shown because too many files have changed in this diff.