From ee82cfba52d4b84a2e0d0b5a108693a913b27ec4 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 16:34:33 -0800 Subject: [PATCH 001/218] fix(proxy/utils.py): if langfuse trace id passed in, just send that as part of alert --- litellm/proxy/utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 84b09d7265..20d958bae0 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -149,12 +149,20 @@ class ProxyLogging: if request_data is not None: model = request_data.get("model", "") messages = request_data.get("messages", "") - # try casting messages to str and get the first 100 characters, else mark as None - try: + trace_id = request_data.get("metadata", {}).get( + "trace_id", None + ) # get langfuse trace id + if trace_id is not None: messages = str(messages) - messages = messages[:10000] - except: - messages = None + messages = messages[:100] + messages = f"{messages}\nLangfuse Trace Id: {trace_id}" + else: + # try casting messages to str and get the first 100 characters, else mark as None + try: + messages = str(messages) + messages = messages[:10000] + except: + messages = None request_info = f"\nRequest Model: {model}\nMessages: {messages}" else: From 98b397899855b29186fe8a042e58906d6c1d0323 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 19:28:57 -0800 Subject: [PATCH 002/218] feat(ui): enable admin to view all valid keys created on the proxy --- litellm/proxy/proxy_server.py | 68 ++++++- litellm/proxy/utils.py | 17 +- .../src/components/navbar.tsx | 72 ++++---- .../src/components/networking.tsx | 82 ++++----- .../src/components/user_dashboard.tsx | 96 +++++----- .../src/components/view_key_spend_report.tsx | 173 +++++++++++------- .../src/components/view_key_table.tsx | 13 +- 7 files changed, 312 insertions(+), 209 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 289a36cb2b..0fe6997eec 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -322,6 +322,7 @@ async def user_api_key_auth( f"Malformed API Key passed in. Ensure Key has `Bearer ` prefix. Passed in: {passed_in_key}" ) + ### CHECK IF ADMIN ### # note: never string compare api keys, this is vulenerable to a time attack. 
Use secrets.compare_digest instead is_master_key_valid = secrets.compare_digest(api_key, master_key) if is_master_key_valid: @@ -454,6 +455,12 @@ async def user_api_key_auth( if _user is None: continue assert isinstance(_user, dict) + # check if user is admin # + if ( + _user.get("user_role", None) is not None + and _user.get("user_role") == "proxy_admin" + ): + return UserAPIKeyAuth(api_key=master_key) # Token exists, not expired now check if its in budget for the user user_max_budget = _user.get("max_budget", None) user_current_spend = _user.get("spend", None) @@ -597,10 +604,13 @@ async def user_api_key_auth( # check if user can access this route query_params = request.query_params user_id = query_params.get("user_id") + verbose_proxy_logger.debug( + f"user_id: {user_id} & valid_token.user_id: {valid_token.user_id}" + ) if user_id != valid_token.user_id: raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, - detail="user not allowed to access this key's info", + detail="key not allowed to access this user's info", ) elif route == "/user/update": raise HTTPException( @@ -1846,6 +1856,9 @@ async def startup_event(): if prisma_client is not None and master_key is not None: # add master key to db + user_id = "default_user_id" + if os.getenv("PROXY_ADMIN_ID", None) is not None: + user_id = os.getenv("PROXY_ADMIN_ID") asyncio.create_task( generate_key_helper_fn( duration=None, @@ -1854,7 +1867,8 @@ async def startup_event(): config={}, spend=0, token=master_key, - user_id="default_user_id", + user_id=user_id, + user_role="proxy_admin", ) ) @@ -3380,12 +3394,13 @@ async def auth_callback(request: Request): result = await microsoft_sso.verify_and_process(request) # User is Authe'd in - generate key for the UI to access Proxy - user_id = getattr(result, "email", None) + user_email = getattr(result, "email", None) + user_id = getattr(result, "id", None) if user_id is None: user_id = getattr(result, "first_name", "") + getattr(result, "last_name", "") response = await generate_key_helper_fn( - **{"duration": "1hr", "key_max_budget": 0, "models": [], "aliases": {}, "config": {}, "spend": 0, "user_id": user_id, "team_id": "litellm-dashboard"} # type: ignore + **{"duration": "1hr", "key_max_budget": 0, "models": [], "aliases": {}, "config": {}, "spend": 0, "user_id": user_id, "team_id": "litellm-dashboard", "user_email": user_email} # type: ignore ) key = response["token"] # type: ignore @@ -3393,10 +3408,25 @@ async def auth_callback(request: Request): litellm_dashboard_ui = "/ui/" + user_role = "app_owner" + if ( + os.getenv("PROXY_ADMIN_ID", None) is not None + and os.environ["PROXY_ADMIN_ID"] == user_id + ): + # checks if user is admin + user_role = "app_admin" + import jwt jwt_token = jwt.encode( - {"user_id": user_id, "key": key}, "secret", algorithm="HS256" + { + "user_id": user_id, + "key": key, + "user_email": user_email, + "user_role": user_role, + }, + "secret", + algorithm="HS256", ) litellm_dashboard_ui += "?userID=" + user_id + "&token=" + jwt_token @@ -3409,10 +3439,18 @@ async def auth_callback(request: Request): "/user/info", tags=["user management"], dependencies=[Depends(user_api_key_auth)] ) async def user_info( - user_id: str = fastapi.Query(..., description="User ID in the request parameters") + user_id: Optional[str] = fastapi.Query( + default=None, description="User ID in the request parameters" + ) ): """ Use this to get user information. 
(user row + all user key info) + + Example request + ``` + curl -X GET 'http://localhost:8000/user/info?user_id=krrish7%40berri.ai' \ + --header 'Authorization: Bearer sk-1234' + ``` """ global prisma_client try: @@ -3421,11 +3459,25 @@ async def user_info( f"Database not connected. Connect a database to your proxy - https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys" ) ## GET USER ROW ## - user_info = await prisma_client.get_data(user_id=user_id) + if user_id is not None: + user_info = await prisma_client.get_data(user_id=user_id) + else: + user_info = None ## GET ALL KEYS ## keys = await prisma_client.get_data( - user_id=user_id, table_name="key", query_type="find_all" + user_id=user_id, + table_name="key", + query_type="find_all", + expires=datetime.now(), ) + + if user_info is None: + ## make sure we still return a total spend ## + spend = 0 + for k in keys: + spend += getattr(k, "spend", 0) + user_info = {"spend": spend} + ## REMOVE HASHED TOKEN INFO before returning ## for key in keys: try: diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 84b09d7265..62cbc6b4be 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -559,9 +559,20 @@ class PrismaClient: # The asterisk before `user_id_list` unpacks the list into separate arguments response = await self.db.query_raw(sql_query) elif query_type == "find_all": - response = await self.db.litellm_usertable.find_many( # type: ignore - order={"spend": "desc"}, - ) + if expires is not None: + response = await self.db.litellm_usertable.find_many( # type: ignore + order={"spend": "desc"}, + where={ # type:ignore + "OR": [ + {"expires": None}, # type:ignore + {"expires": {"gt": expires}}, # type:ignore + ], + }, + ) + else: + response = await self.db.litellm_usertable.find_many( # type: ignore + order={"spend": "desc"}, + ) return response elif table_name == "spend": verbose_proxy_logger.debug( diff --git a/ui/litellm-dashboard/src/components/navbar.tsx b/ui/litellm-dashboard/src/components/navbar.tsx index b7cb357304..946cfc4472 100644 --- a/ui/litellm-dashboard/src/components/navbar.tsx +++ b/ui/litellm-dashboard/src/components/navbar.tsx @@ -1,40 +1,50 @@ "use client"; -import Link from 'next/link'; -import Image from 'next/image' -import React, { useState } from 'react'; +import Link from "next/link"; +import Image from "next/image"; +import React, { useState } from "react"; import { useSearchParams } from "next/navigation"; -import { Button, Text, Metric,Title, TextInput, Grid, Col, Card } from "@tremor/react"; +import { + Button, + Text, + Metric, + Title, + TextInput, + Grid, + Col, + Card, +} from "@tremor/react"; // Define the props type interface NavbarProps { - userID: string | null; - userRole: string | null; + userID: string | null; + userRole: string | null; + userEmail: string | null; } -const Navbar: React.FC = ({ userID, userRole }) => { - console.log("User ID:", userID); +const Navbar: React.FC = ({ userID, userRole, userEmail }) => { + console.log("User ID:", userID); + console.log("userEmail:", userEmail); - return ( - - ) -} + return ( + + ); +}; -export default Navbar; \ No newline at end of file +export default Navbar; diff --git a/ui/litellm-dashboard/src/components/networking.tsx b/ui/litellm-dashboard/src/components/networking.tsx index 4763e475e7..5b8e422864 100644 --- a/ui/litellm-dashboard/src/components/networking.tsx +++ b/ui/litellm-dashboard/src/components/networking.tsx @@ -1,24 +1,24 @@ /** * Helper file for calls being made to proxy */ -import { message } from 
'antd'; +import { message } from "antd"; -const proxyBaseUrl = null; -// const proxyBaseUrl = "http://localhost:4000" // http://localhost:4000 +const isLocal = process.env.NODE_ENV === "development"; +const proxyBaseUrl = isLocal ? "http://localhost:4000" : null; export const keyCreateCall = async ( accessToken: string, userID: string, - formValues: Record, // Assuming formValues is an object + formValues: Record // Assuming formValues is an object ) => { try { - console.log("Form Values in keyCreateCall:", formValues); // Log the form values before making the API call - + console.log("Form Values in keyCreateCall:", formValues); // Log the form values before making the API call + // check if formValues.description is not undefined, make it a string and add it to formValues.metadata if (formValues.description) { // add to formValues.metadata if (!formValues.metadata) { - formValues.metadata = {} + formValues.metadata = {}; } // value needs to be in "", valid JSON formValues.metadata.description = formValues.description; @@ -26,7 +26,7 @@ export const keyCreateCall = async ( delete formValues.description; formValues.metadata = JSON.stringify(formValues.metadata); } - // if formValues.metadata is not undefined, make it a valid dict + // if formValues.metadata is not undefined, make it a valid dict if (formValues.metadata) { console.log("formValues.metadata:", formValues.metadata); // if there's an exception JSON.parse, show it in the message @@ -69,15 +69,11 @@ export const keyCreateCall = async ( } }; - -export const keyDeleteCall = async ( - accessToken: String, - user_key: String -) => { +export const keyDeleteCall = async (accessToken: String, user_key: String) => { try { const url = proxyBaseUrl ? `${proxyBaseUrl}/key/delete` : `/key/delete`; - console.log("in keyDeleteCall:", user_key) - + console.log("in keyDeleteCall:", user_key); + const response = await fetch(url, { method: "POST", headers: { @@ -108,21 +104,22 @@ export const keyDeleteCall = async ( export const userInfoCall = async ( accessToken: String, - userID: String + userID: String, + userRole: String ) => { try { - const url = proxyBaseUrl ? `${proxyBaseUrl}/user/info` : `/user/info`; - console.log("in userInfoCall:", url) - const response = await fetch( - `${url}/?user_id=${userID}`, - { - method: "GET", - headers: { - Authorization: `Bearer ${accessToken}`, - "Content-Type": "application/json", - }, - } - ); + let url = proxyBaseUrl ? `${proxyBaseUrl}/user/info` : `/user/info`; + if (userRole == "App Owner") { + url = `${url}/?user_id=${userID}`; + } + message.info("Requesting user data"); + const response = await fetch(url, { + method: "GET", + headers: { + Authorization: `Bearer ${accessToken}`, + "Content-Type": "application/json", + }, + }); if (!response.ok) { const errorData = await response.text(); @@ -131,7 +128,7 @@ export const userInfoCall = async ( } const data = await response.json(); - console.log(data); + message.info("Received user data"); return data; // Handle success - you might want to update some state or UI based on the created key } catch (error) { @@ -140,24 +137,17 @@ export const userInfoCall = async ( } }; - -export const keySpendLogsCall = async ( - accessToken: String, - token: String -) => { +export const keySpendLogsCall = async (accessToken: String, token: String) => { try { const url = proxyBaseUrl ? 
`${proxyBaseUrl}/spend/logs` : `/spend/logs`; - console.log("in keySpendLogsCall:", url) - const response = await fetch( - `${url}/?api_key=${token}`, - { - method: "GET", - headers: { - Authorization: `Bearer ${accessToken}`, - "Content-Type": "application/json", - }, - } - ); + console.log("in keySpendLogsCall:", url); + const response = await fetch(`${url}/?api_key=${token}`, { + method: "GET", + headers: { + Authorization: `Bearer ${accessToken}`, + "Content-Type": "application/json", + }, + }); if (!response.ok) { const errorData = await response.text(); message.error(errorData); @@ -171,4 +161,4 @@ export const keySpendLogsCall = async ( console.error("Failed to create key:", error); throw error; } -} +}; diff --git a/ui/litellm-dashboard/src/components/user_dashboard.tsx b/ui/litellm-dashboard/src/components/user_dashboard.tsx index 951d0287bf..b1a06939b1 100644 --- a/ui/litellm-dashboard/src/components/user_dashboard.tsx +++ b/ui/litellm-dashboard/src/components/user_dashboard.tsx @@ -6,21 +6,25 @@ import CreateKey from "./create_key_button"; import ViewKeyTable from "./view_key_table"; import ViewUserSpend from "./view_user_spend"; import EnterProxyUrl from "./enter_proxy_url"; +import { message } from "antd"; import Navbar from "./navbar"; import { useSearchParams } from "next/navigation"; import { jwtDecode } from "jwt-decode"; -const proxyBaseUrl = null; -// const proxyBaseUrl = "http://localhost:4000" // http://localhost:4000 +const isLocal = process.env.NODE_ENV === "development"; +console.log("isLocal:", isLocal); +const proxyBaseUrl = isLocal ? "http://localhost:4000" : null; type UserSpendData = { spend: number; max_budget?: number | null; -} +}; const UserDashboard = () => { const [data, setData] = useState(null); // Keep the initialization of state here - const [userSpendData, setUserSpendData] = useState(null); + const [userSpendData, setUserSpendData] = useState( + null + ); // Assuming useSearchParams() hook exists and works in your setup const searchParams = useSearchParams(); @@ -30,19 +34,19 @@ const UserDashboard = () => { const token = searchParams.get("token"); const [accessToken, setAccessToken] = useState(null); const [userRole, setUserRole] = useState(null); - + const [userEmail, setUserEmail] = useState(null); function formatUserRole(userRole: string) { if (!userRole) { return "Undefined Role"; } - + console.log(`Received user role: ${userRole}`); switch (userRole.toLowerCase()) { case "app_owner": return "App Owner"; case "demo_app_owner": - return "AppOwner"; - case "admin": + return "App Owner"; + case "app_admin": return "Admin"; case "app_user": return "App User"; @@ -53,7 +57,7 @@ const UserDashboard = () => { // Moved useEffect inside the component and used a condition to run fetch only if the params are available useEffect(() => { - if (token){ + if (token) { const decoded = jwtDecode(token) as { [key: string]: any }; if (decoded) { // cast decoded to dictionary @@ -71,17 +75,19 @@ const UserDashboard = () => { } else { console.log("User role not defined"); } + + if (decoded.user_email) { + setUserEmail(decoded.user_email); + } else { + console.log(`User Email is not set ${decoded}`); + } } } - if (userID && accessToken && !data) { + if (userID && accessToken && userRole && !data) { const fetchData = async () => { try { - const response = await userInfoCall( - accessToken, - userID - ); - console.log("Response:", response); - setUserSpendData(response["user_info"]) + const response = await userInfoCall(accessToken, userID, userRole); + 
setUserSpendData(response["user_info"]); setData(response["keys"]); // Assuming this is the correct path to your data } catch (error) { console.error("There was an error fetching the data", error); @@ -93,53 +99,45 @@ const UserDashboard = () => { }, [userID, token, accessToken, data]); if (userID == null || token == null) { - - // Now you can construct the full URL - const url = proxyBaseUrl ? `${proxyBaseUrl}/sso/key/generate` : `/sso/key/generate`; + const url = proxyBaseUrl + ? `${proxyBaseUrl}/sso/key/generate` + : `/sso/key/generate`; console.log("Full URL:", url); window.location.href = url; return null; - } - else if (accessToken == null) { + } else if (accessToken == null) { return null; } if (userRole == null) { - setUserRole("App Owner") + setUserRole("App Owner"); } - + return (
- + - - - - - - + + + + + +
- ); }; -export default UserDashboard; \ No newline at end of file +export default UserDashboard; diff --git a/ui/litellm-dashboard/src/components/view_key_spend_report.tsx b/ui/litellm-dashboard/src/components/view_key_spend_report.tsx index 40961325ec..e90401e5bf 100644 --- a/ui/litellm-dashboard/src/components/view_key_spend_report.tsx +++ b/ui/litellm-dashboard/src/components/view_key_spend_report.tsx @@ -1,8 +1,26 @@ "use client"; import React, { useState, useEffect } from "react"; -import { Button as Button2, Modal, Form, Input, InputNumber, Select, message } from "antd"; -import { Button, Text, Card, Table, BarChart, Title, Subtitle, BarList, Metric } from "@tremor/react"; +import { + Button as Button2, + Modal, + Form, + Input, + InputNumber, + Select, + message, +} from "antd"; +import { + Button, + Text, + Card, + Table, + BarChart, + Title, + Subtitle, + BarList, + Metric, +} from "@tremor/react"; import { keySpendLogsCall } from "./networking"; interface ViewKeySpendReportProps { @@ -14,18 +32,30 @@ interface ViewKeySpendReportProps { } type ResponseValueType = { - startTime: string; // Assuming startTime is a string, adjust it if it's of a different type - spend: number; // Assuming spend is a number, adjust it if it's of a different type - user: string; // Assuming user is a string, adjust it if it's of a different type - }; + startTime: string; // Assuming startTime is a string, adjust it if it's of a different type + spend: number; // Assuming spend is a number, adjust it if it's of a different type + user: string; // Assuming user is a string, adjust it if it's of a different type +}; -const ViewKeySpendReport: React.FC = ({ token, accessToken, keySpend, keyBudget, keyName }) => { +const ViewKeySpendReport: React.FC = ({ + token, + accessToken, + keySpend, + keyBudget, + keyName, +}) => { const [isModalVisible, setIsModalVisible] = useState(false); - const [data, setData] = useState<{ day: string; spend: number; }[] | null>(null); - const [userData, setUserData] = useState<{ name: string; value: number; }[] | null>(null); + const [data, setData] = useState<{ day: string; spend: number }[] | null>( + null + ); + const [userData, setUserData] = useState< + { name: string; value: number }[] | null + >(null); const showModal = () => { + console.log("Show Modal triggered"); setIsModalVisible(true); + fetchData(); }; const handleOk = () => { @@ -41,68 +71,79 @@ const ViewKeySpendReport: React.FC = ({ token, accessTo try { if (accessToken == null || token == null) { return; - } - const response = await keySpendLogsCall(accessToken=accessToken, token=token); + } + console.log(`accessToken: ${accessToken}; token: ${token}`); + const response = await keySpendLogsCall( + (accessToken = accessToken), + (token = token) + ); console.log("Response:", response); // loop through response // get spend, startTime for each element, place in new array - - const pricePerDay: Record = (Object.values(response) as ResponseValueType[]).reduce((acc: Record, value) => { + const pricePerDay: Record = ( + Object.values(response) as ResponseValueType[] + ).reduce((acc: Record, value) => { const startTime = new Date(value.startTime); - const day = new Intl.DateTimeFormat('en-US', { day: '2-digit', month: 'short' }).format(startTime); - + const day = new Intl.DateTimeFormat("en-US", { + day: "2-digit", + month: "short", + }).format(startTime); + acc[day] = (acc[day] || 0) + value.spend; - + return acc; }, {}); - - + // sort pricePerDay by day // Convert object to array of key-value pairs - const 
pricePerDayArray = Object.entries(pricePerDay); + const pricePerDayArray = Object.entries(pricePerDay); - // Sort the array based on the date (key) - pricePerDayArray.sort(([aKey], [bKey]) => { - const dateA = new Date(aKey); - const dateB = new Date(bKey); - return dateA.getTime() - dateB.getTime(); - }); - - // Convert the sorted array back to an object - const sortedPricePerDay = Object.fromEntries(pricePerDayArray); + // Sort the array based on the date (key) + pricePerDayArray.sort(([aKey], [bKey]) => { + const dateA = new Date(aKey); + const dateB = new Date(bKey); + return dateA.getTime() - dateB.getTime(); + }); + // Convert the sorted array back to an object + const sortedPricePerDay = Object.fromEntries(pricePerDayArray); console.log(sortedPricePerDay); - - const pricePerUser: Record = (Object.values(response) as ResponseValueType[]).reduce((acc: Record, value) => { + + const pricePerUser: Record = ( + Object.values(response) as ResponseValueType[] + ).reduce((acc: Record, value) => { const user = value.user; acc[user] = (acc[user] || 0) + value.spend; - + return acc; }, {}); - - + console.log(pricePerDay); console.log(pricePerUser); const arrayBarChart = []; - // [ - // { - // "day": "02 Feb", - // "spend": pricePerDay["02 Feb"], - // } - // ] + // [ + // { + // "day": "02 Feb", + // "spend": pricePerDay["02 Feb"], + // } + // ] for (const [key, value] of Object.entries(sortedPricePerDay)) { arrayBarChart.push({ day: key, spend: value }); } - // get 5 most expensive users - const sortedUsers = Object.entries(pricePerUser).sort((a, b) => b[1] - a[1]); + const sortedUsers = Object.entries(pricePerUser).sort( + (a, b) => b[1] - a[1] + ); const top5Users = sortedUsers.slice(0, 5); - const userChart = top5Users.map(([key, value]) => ({ name: key, value: value })); - + const userChart = top5Users.map(([key, value]) => ({ + name: key, + value: value, + })); + setData(arrayBarChart); setUserData(userChart); console.log("arrayBarChart:", arrayBarChart); @@ -112,11 +153,10 @@ const ViewKeySpendReport: React.FC = ({ token, accessTo } }; - useEffect(() => { - // Fetch data only when the token changes - fetchData(); - }, [token]); // Dependency array containing the 'token' variable - + // useEffect(() => { + // // Fetch data only when the token changes + // fetchData(); + // }, [token]); // Dependency array containing the 'token' variable if (!token) { return null; @@ -134,33 +174,28 @@ const ViewKeySpendReport: React.FC = ({ token, accessTo onCancel={handleCancel} footer={null} > - Key Name: {keyName} + Key Name: {keyName} Monthly Spend ${keySpend} - {data && ( + {data && ( - )} - - Top 5 Users Spend (USD) - - {userData && ( - - )} - - + )} + + Top 5 Users Spend (USD) + + {userData && ( + + )} + ); diff --git a/ui/litellm-dashboard/src/components/view_key_table.tsx b/ui/litellm-dashboard/src/components/view_key_table.tsx index 8522a6bb16..4813bbe4e3 100644 --- a/ui/litellm-dashboard/src/components/view_key_table.tsx +++ b/ui/litellm-dashboard/src/components/view_key_table.tsx @@ -1,5 +1,5 @@ "use client"; -import React, { useEffect } from "react"; +import React, { useEffect, useState } from "react"; import { keyDeleteCall } from "./networking"; import { StatusOnlineIcon, TrashIcon } from "@heroicons/react/outline"; import { @@ -32,6 +32,8 @@ const ViewKeyTable: React.FC = ({ data, setData, }) => { + const [isButtonClicked, setIsButtonClicked] = useState(false); + const handleDelete = async (token: String) => { if (data == null) { return; @@ -116,8 +118,13 @@ const ViewKeyTable: React.FC = 
({ /> - - + ); From 7b7f80b24df13eac0691f7ae7a8d99fd2866ea99 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 21:43:17 -0800 Subject: [PATCH 003/218] test(test_key_generate_dynamodb.py): fix test --- litellm/tests/test_key_generate_dynamodb.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/litellm/tests/test_key_generate_dynamodb.py b/litellm/tests/test_key_generate_dynamodb.py index 61d0ff6a66..573bd944db 100644 --- a/litellm/tests/test_key_generate_dynamodb.py +++ b/litellm/tests/test_key_generate_dynamodb.py @@ -490,8 +490,13 @@ def test_dynamo_db_migration(custom_db_client): try: async def test(): + request = GenerateKeyRequest(max_budget=1) + key = await generate_key_fn(request) + print(key) + + generated_key = key.key bearer_token = ( - "Bearer " + "sk-elJDL2pOEjcAuC7zD4psAg" + "Bearer " + generated_key ) # this works with ishaan's db, it's a never expiring key request = Request(scope={"type": "http"}) @@ -508,4 +513,4 @@ def test_dynamo_db_migration(custom_db_client): asyncio.run(test()) except Exception as e: - pytest.fail(f"An exception occurred - {str(e)}") + pytest.fail(f"An exception occurred - {traceback.format_exc()}") From 32639bf398ac076c0829a4fa8da5ca65230b6391 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 09:21:03 -0800 Subject: [PATCH 004/218] fix(utils.py): return finish reason for last vertex ai chunk --- litellm/proxy/proxy_server.py | 31 ++++++++++++++++++++++++++++--- litellm/utils.py | 34 ++++++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 494c874147..5c336ea91e 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1746,7 +1746,33 @@ async def async_data_generator(response, user_api_key_dict): done_message = "[DONE]" yield f"data: {done_message}\n\n" except Exception as e: - yield f"data: {str(e)}\n\n" + traceback.print_exc() + await proxy_logging_obj.post_call_failure_hook( + user_api_key_dict=user_api_key_dict, original_exception=e + ) + verbose_proxy_logger.debug( + f"\033[1;31mAn error occurred: {e}\n\n Debug this by setting `--debug`, e.g. 
`litellm --model gpt-3.5-turbo --debug`" + ) + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] + ) + if user_debug: + traceback.print_exc() + + if isinstance(e, HTTPException): + raise e + else: + error_traceback = traceback.format_exc() + error_msg = f"{str(e)}\n\n{error_traceback}" + + raise ProxyException( + message=getattr(e, "message", error_msg), + type=getattr(e, "type", "None"), + param=getattr(e, "param", "None"), + code=getattr(e, "status_code", 500), + ) def select_data_generator(response, user_api_key_dict): @@ -1754,7 +1780,7 @@ def select_data_generator(response, user_api_key_dict): # since boto3 - sagemaker does not support async calls, we should use a sync data_generator if hasattr( response, "custom_llm_provider" - ) and response.custom_llm_provider in ["sagemaker", "together_ai"]: + ) and response.custom_llm_provider in ["sagemaker"]: return data_generator( response=response, ) @@ -2239,7 +2265,6 @@ async def chat_completion( selected_data_generator = select_data_generator( response=response, user_api_key_dict=user_api_key_dict ) - return StreamingResponse( selected_data_generator, media_type="text/event-stream", diff --git a/litellm/utils.py b/litellm/utils.py index 5ccb85ef05..31eeaacab4 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -169,6 +169,8 @@ def map_finish_reason( return "stop" elif finish_reason == "SAFETY": # vertex ai return "content_filter" + elif finish_reason == "STOP": # vertex ai + return "stop" return finish_reason @@ -1305,7 +1307,7 @@ class Logging: ) if callback == "langfuse": global langFuseLogger - verbose_logger.debug("reaches langfuse for logging!") + verbose_logger.debug("reaches langfuse for success logging!") kwargs = {} for k, v in self.model_call_details.items(): if ( @@ -6706,7 +6708,13 @@ def exception_type( message=f"VertexAIException - {error_str}", model=model, llm_provider="vertex_ai", - response=original_exception.response, + response=httpx.Response( + status_code=429, + request=httpx.Request( + method="POST", + url=" https://cloud.google.com/vertex-ai/", + ), + ), ) elif ( "429 Quota exceeded" in error_str @@ -8341,13 +8349,20 @@ class CustomStreamWrapper: completion_obj["content"] = chunk.text elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"): try: - # print(chunk) - if hasattr(chunk, "text"): - # vertexAI chunks return - # MultiCandidateTextGenerationResponse(text=' ```python\n# This Python code says "Hi" 100 times.\n\n# Create', _prediction_response=Prediction(predictions=[{'candidates': [{'content': ' ```python\n# This Python code says "Hi" 100 times.\n\n# Create', 'author': '1'}], 'citationMetadata': [{'citations': None}], 'safetyAttributes': [{'blocked': False, 'scores': None, 'categories': None}]}], deployed_model_id='', model_version_id=None, model_resource_name=None, explanations=None), is_blocked=False, safety_attributes={}, candidates=[ ```python - # This Python code says "Hi" 100 times. - # Create]) - completion_obj["content"] = chunk.text + if hasattr(chunk, "candidates") == True: + try: + completion_obj["content"] = chunk.text + if hasattr(chunk.candidates[0], "finish_reason"): + model_response.choices[ + 0 + ].finish_reason = map_finish_reason( + chunk.candidates[0].finish_reason.name + ) + except: + if chunk.candidates[0].finish_reason.name == "SAFETY": + raise Exception( + f"The response was blocked by VertexAI. 
{str(chunk)}" + ) else: completion_obj["content"] = str(chunk) except StopIteration as e: @@ -8636,7 +8651,6 @@ class CustomStreamWrapper: or self.custom_llm_provider == "ollama_chat" or self.custom_llm_provider == "vertex_ai" ): - print_verbose(f"INSIDE ASYNC STREAMING!!!") print_verbose( f"value of async completion stream: {self.completion_stream}" ) From 97514b6bedaa306a81ab4dd7919d56e96890fe6c Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 12:57:05 -0800 Subject: [PATCH 005/218] fix(proxy_server.py): do a health check on db before returning if proxy ready (if db connected) --- litellm/proxy/proxy_server.py | 18 +++++++++--------- litellm/proxy/utils.py | 17 +++++++++++++++-- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 494c874147..bd5b43f5f5 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4051,16 +4051,16 @@ async def health_readiness(): cache_type = litellm.cache.type if prisma_client is not None: # if db passed in, check if it's connected - if prisma_client.db.is_connected() == True: - response_object = {"db": "connected"} + await prisma_client.health_check() # test the db connection + response_object = {"db": "connected"} - return { - "status": "healthy", - "db": "connected", - "cache": cache_type, - "litellm_version": version, - "success_callbacks": litellm.success_callback, - } + return { + "status": "healthy", + "db": "connected", + "cache": cache_type, + "litellm_version": version, + "success_callbacks": litellm.success_callback, + } else: return { "status": "healthy", diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 84b09d7265..5c5b5b7727 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -472,8 +472,6 @@ class PrismaClient: reset_at: Optional[datetime] = None, ): try: - print_verbose("PrismaClient: get_data") - response: Any = None if token is not None or (table_name is not None and table_name == "key"): # check if plain text or hash @@ -885,6 +883,21 @@ class PrismaClient: ) raise e + async def health_check(self): + """ + Health check endpoint for the prisma client + """ + sql_query = """ + SELECT 1 + FROM "LiteLLM_VerificationToken" + LIMIT 1 + """ + + # Execute the raw query + # The asterisk before `user_id_list` unpacks the list into separate arguments + response = await self.db.query_raw(sql_query) + return response + class DBClient: """ From f57c054920994a57460a29e74d2d51b8eef53a8a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:57:20 -0800 Subject: [PATCH 006/218] (ci/cd) run in verbose mode --- .circleci/config.yml | 2 +- litellm/tests/test_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c1224159a1..9a29ed07ca 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add 
any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 59ea06d9c98858e210939c5dd23b5f3a337d9256 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:27:24 -0800 Subject: [PATCH 007/218] (fix) rename proxy startup test --- litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} (100%) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_aproxy_startup.py similarity index 100% rename from litellm/tests/test_proxy_startup.py rename to litellm/tests/test_aproxy_startup.py From 8d9c51b50ef6b2f0632b26c2c5224c968d5e9ca5 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:38:57 -0800 Subject: [PATCH 008/218] (fix) proxy_startup test --- litellm/tests/test_aproxy_startup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/litellm/tests/test_aproxy_startup.py b/litellm/tests/test_aproxy_startup.py index a846c9f4a3..024d69b1ff 100644 --- a/litellm/tests/test_aproxy_startup.py +++ b/litellm/tests/test_aproxy_startup.py @@ -36,6 +36,11 @@ def test_proxy_gunicorn_startup_direct_config(): from litellm._logging import verbose_proxy_logger, verbose_router_logger import logging + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +54,10 @@ def test_proxy_gunicorn_startup_direct_config(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url def test_proxy_gunicorn_startup_config_dict(): @@ -58,6 +67,11 @@ def test_proxy_gunicorn_startup_config_dict(): verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -71,6 +85,10 @@ def test_proxy_gunicorn_startup_config_dict(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url # test_proxy_gunicorn_startup() From ddfcccda38a8df37e0539511ffffa6b6a218dc88 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:16 -0800 Subject: [PATCH 009/218] (ci/cd) run pytest without -s --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a29ed07ca..c1224159a1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results From f81f73f3b5341c073f91b9a16bdd57408133bdcf Mon Sep 17 
00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:24 -0800 Subject: [PATCH 010/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 86a44c4a3dc7436ca0df69a78579393ecf41386d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:43:28 -0800 Subject: [PATCH 011/218] (fix) parallel_request_limiter debug --- litellm/proxy/hooks/parallel_request_limiter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index ca60421a50..48cf5b7799 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -130,7 +130,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger): "current_rpm": current["current_rpm"] + 1, } - self.print_verbose(f"updated_value in success call: {new_val}") + self.print_verbose( + f"updated_value in success call: {new_val}, precise_minute: {precise_minute}" + ) self.user_api_key_cache.set_cache( request_count_api_key, new_val, ttl=60 ) # store in cache for 1 min. From d00ed06744a8e4fe22e5448ff280fa1255bdae9f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:44:30 -0800 Subject: [PATCH 012/218] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 528bb19d2a..bfac8ddeae 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -306,6 +306,10 @@ async def test_normal_router_call(): @pytest.mark.asyncio async def test_normal_router_tpm_limit(): + from litellm._logging import verbose_proxy_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) model_list = [ { "model_name": "azure-model", @@ -353,6 +357,7 @@ async def test_normal_router_tpm_limit(): current_minute = datetime.now().strftime("%M") precise_minute = f"{current_date}-{current_hour}-{current_minute}" request_count_api_key = f"{_api_key}::{precise_minute}::request_count" + print("Test: Checking current_requests for precise_minute=", precise_minute) assert ( parallel_request_handler.user_api_key_cache.get_cache( @@ -366,6 +371,7 @@ async def test_normal_router_tpm_limit(): model="azure-model", messages=[{"role": "user", "content": "Write me a paragraph on the moon"}], metadata={"user_api_key": _api_key}, + mock_response="hello", ) await asyncio.sleep(1) # success is done in a separate thread print(f"response: {response}") From d359465cccb5bdd7d4c5e5e45596c655f505adf9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:47:19 -0800 Subject: [PATCH 013/218] (ci/cd) fix test_config_no_auth --- .../test_configs/test_config_no_auth.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml 
b/litellm/tests/test_configs/test_config_no_auth.yaml index ccebe016db..9d7aff5702 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,11 +9,21 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model +- litellm_params: + api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 + api_key: os.environ/AZURE_API_KEY + model: azure/chatgpt-v-2 + model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -26,8 +36,93 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 + model_name: test_openai_models +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + mode: embedding + model_name: azure-embedding-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 55848c55-4162-40f9-a6e2-9a722b9ef404 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34339b1e-e030-4bcc-a531-c48559f10ce4 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: f6f74e14-ac64-4403-9365-319e584dcdc5 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 9b1ef341-322c-410a-8992-903987fef439 + model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings +- litellm_params: + model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: + mode: embedding + model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) +- litellm_params: + model: dall-e-3 + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_SWEDEN_API_BASE + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_version: 2023-12-01-preview + model: azure/dall-e-3-test + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-06-01-preview + model: azure/ + model_info: + mode: image_generation + model_name: dall-e-2 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + base_model: text-embedding-ada-002 + mode: embedding + model_name: text-embedding-ada-002 +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 
34cb2419-7c63-44ae-a189-53f1d1ce5953 + model_name: test_openai_models \ No newline at end of file From 3fc1ff0c73467ad04210a1e9d6242fd8b93bd7d0 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:53:47 -0800 Subject: [PATCH 014/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From f2b56be491cc8b879f5fdaeec9ed32fc3535daf1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:02:36 -0800 Subject: [PATCH 015/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 50efa6a76dccec068a26fe4facbad9b0440d89c0 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:29 -0800 Subject: [PATCH 016/218] fix(utils.py): round max tokens to be int always --- litellm/tests/test_completion.py | 5 +++-- litellm/utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..de79c97afa 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -544,13 +544,13 @@ def hf_test_completion_tgi(): def test_completion_openai(): try: litellm.set_verbose = True + litellm.drop_params = True print(f"api key: {os.environ['OPENAI_API_KEY']}") litellm.api_key = os.environ["OPENAI_API_KEY"] response = completion( model="gpt-3.5-turbo", - messages=messages, + messages=[{"role": "user", "content": "Hey"}], max_tokens=10, - request_timeout=1, metadata={"hi": "bye"}, ) print("This is the response object\n", response) @@ -565,6 +565,7 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None + raise Exception("it works!") except Timeout as e: pass except Exception as e: diff --git a/litellm/utils.py b/litellm/utils.py index 5ccb85ef05..fdca57e51f 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2348,7 +2348,9 @@ def client(original_function): elif user_max_tokens + input_tokens > max_output_tokens: user_max_tokens = max_output_tokens - input_tokens print_verbose(f"user_max_tokens: {user_max_tokens}") - kwargs["max_tokens"] = user_max_tokens + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int except Exception as e: print_verbose(f"Error while checking max token limit: {str(e)}") # MODEL CALL From c97bb22f909a6e76b277fbc1a471abe617fd0f67 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:49 -0800 Subject: [PATCH 017/218] 
=?UTF-8?q?bump:=20version=201.22.8=20=E2=86=92=20?= =?UTF-8?q?1.22.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17d80ae8ee..944aad7f8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.8" +version = "1.22.9" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.8" +version = "1.22.9" version_files = [ "pyproject.toml:^version" ] From 1f4b2e34b92c47034b4552bb334f5dad234261a1 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:35:46 -0800 Subject: [PATCH 018/218] build(requirements.txt): update the proxy requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c9bd0e511d..768e8dff3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions -google-generativeai==0.1.0 # for vertex ai calls +google-generativeai==0.3.2 # for vertex ai calls async_generator==1.10.0 # for async ollama calls traceloop-sdk==0.5.3 # for open telemetry logging langfuse>=2.6.3 # for langfuse self-hosted logging From 87a92aa65ebefe7752c2601a2de4954df815fc0e Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:11:43 -0800 Subject: [PATCH 019/218] fix(ollama.py): support format for ollama --- litellm/llms/ollama.py | 10 +++++++++- litellm/llms/ollama_chat.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index d0bc24af4c..9339deb78d 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -146,7 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "prompt": prompt, "options": optional_params} + format = optional_params.pop("format", None) + data = { + "model": model, + "prompt": prompt, + "options": optional_params, + "stream": stream, + } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index d1a439398b..0311931b13 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,12 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) + format = optional_params.pop("format", None) data = { "model": model, "messages": messages, "options": optional_params, "stream": stream, } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( input=None, From 705f968136a61a0b25413dfdc5bda65d3e11d72f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:12:13 -0800 Subject: [PATCH 020/218] =?UTF-8?q?bump:=20version=201.22.7=20=E2=86=92=20?= =?UTF-8?q?1.22.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be8c8966be..17d80ae8ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = 
"litellm" -version = "1.22.7" +version = "1.22.8" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.7" +version = "1.22.8" version_files = [ "pyproject.toml:^version" ] From f5f44e8bb9a246d2632c3246f0057810cab9ce66 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:57:20 -0800 Subject: [PATCH 021/218] (ci/cd) run in verbose mode --- .circleci/config.yml | 2 +- litellm/tests/test_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c1224159a1..9a29ed07ca 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 34937c23aed53bf062455c08835f2f3c75ed62f4 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:27:24 -0800 Subject: [PATCH 022/218] (fix) rename proxy startup test --- litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} (100%) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_aproxy_startup.py similarity index 100% rename from litellm/tests/test_proxy_startup.py rename to litellm/tests/test_aproxy_startup.py From 69064b033b925616671c3b574546520f2668e57a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:38:57 -0800 Subject: [PATCH 023/218] (fix) proxy_startup test --- litellm/tests/test_aproxy_startup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/litellm/tests/test_aproxy_startup.py b/litellm/tests/test_aproxy_startup.py index a846c9f4a3..024d69b1ff 100644 --- a/litellm/tests/test_aproxy_startup.py +++ b/litellm/tests/test_aproxy_startup.py @@ -36,6 +36,11 @@ def test_proxy_gunicorn_startup_direct_config(): from litellm._logging import verbose_proxy_logger, verbose_router_logger import logging + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +54,10 @@ def test_proxy_gunicorn_startup_direct_config(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url def test_proxy_gunicorn_startup_config_dict(): @@ -58,6 +67,11 @@ def test_proxy_gunicorn_startup_config_dict(): 
verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -71,6 +85,10 @@ def test_proxy_gunicorn_startup_config_dict(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url # test_proxy_gunicorn_startup() From a804fb7db8854d86981c5a027f50fbdfd2a2e7ce Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:16 -0800 Subject: [PATCH 024/218] (ci/cd) run pytest without -s --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a29ed07ca..c1224159a1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results From ef27d1293eab59915748dcdc1f6853b684e5cf68 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:24 -0800 Subject: [PATCH 025/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 29a6f8b44963c376213d53f66956d2328e36914b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:43:28 -0800 Subject: [PATCH 026/218] (fix) parallel_request_limiter debug --- litellm/proxy/hooks/parallel_request_limiter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index ca60421a50..48cf5b7799 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -130,7 +130,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger): "current_rpm": current["current_rpm"] + 1, } - self.print_verbose(f"updated_value in success call: {new_val}") + self.print_verbose( + f"updated_value in success call: {new_val}, precise_minute: {precise_minute}" + ) self.user_api_key_cache.set_cache( request_count_api_key, new_val, ttl=60 ) # store in cache for 1 min. 
From 61a4f4f948bf015084d30e5d3d78782df2c112f6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:44:30 -0800 Subject: [PATCH 027/218] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 528bb19d2a..bfac8ddeae 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -306,6 +306,10 @@ async def test_normal_router_call(): @pytest.mark.asyncio async def test_normal_router_tpm_limit(): + from litellm._logging import verbose_proxy_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) model_list = [ { "model_name": "azure-model", @@ -353,6 +357,7 @@ async def test_normal_router_tpm_limit(): current_minute = datetime.now().strftime("%M") precise_minute = f"{current_date}-{current_hour}-{current_minute}" request_count_api_key = f"{_api_key}::{precise_minute}::request_count" + print("Test: Checking current_requests for precise_minute=", precise_minute) assert ( parallel_request_handler.user_api_key_cache.get_cache( @@ -366,6 +371,7 @@ async def test_normal_router_tpm_limit(): model="azure-model", messages=[{"role": "user", "content": "Write me a paragraph on the moon"}], metadata={"user_api_key": _api_key}, + mock_response="hello", ) await asyncio.sleep(1) # success is done in a separate thread print(f"response: {response}") From e6fb8250557bdf68bf4478c0467ac5edff06a978 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:47:19 -0800 Subject: [PATCH 028/218] (ci/cd) fix test_config_no_auth --- .../test_configs/test_config_no_auth.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index ccebe016db..9d7aff5702 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,11 +9,21 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model +- litellm_params: + api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 + api_key: os.environ/AZURE_API_KEY + model: azure/chatgpt-v-2 + model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -26,8 +36,93 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 + model_name: test_openai_models +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + mode: embedding + model_name: azure-embedding-model +- litellm_params: + model: 
gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 55848c55-4162-40f9-a6e2-9a722b9ef404 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34339b1e-e030-4bcc-a531-c48559f10ce4 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: f6f74e14-ac64-4403-9365-319e584dcdc5 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 9b1ef341-322c-410a-8992-903987fef439 + model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings +- litellm_params: + model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: + mode: embedding + model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) +- litellm_params: + model: dall-e-3 + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_SWEDEN_API_BASE + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_version: 2023-12-01-preview + model: azure/dall-e-3-test + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-06-01-preview + model: azure/ + model_info: + mode: image_generation + model_name: dall-e-2 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + base_model: text-embedding-ada-002 + mode: embedding + model_name: text-embedding-ada-002 +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 + model_name: test_openai_models \ No newline at end of file From 9bef2a94d0e2ec65dfd3891f73250b430cd77013 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:53:47 -0800 Subject: [PATCH 029/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From ee0f5793dc52fb97281c5879edb5126a80fac14a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:02:36 -0800 Subject: [PATCH 030/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 659a460923b83156c398d1e83915ee251c1bf3df Mon Sep 17 00:00:00 2001 From: 
Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:29 -0800 Subject: [PATCH 031/218] fix(utils.py): round max tokens to be int always --- litellm/tests/test_completion.py | 5 +++-- litellm/utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..de79c97afa 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -544,13 +544,13 @@ def hf_test_completion_tgi(): def test_completion_openai(): try: litellm.set_verbose = True + litellm.drop_params = True print(f"api key: {os.environ['OPENAI_API_KEY']}") litellm.api_key = os.environ["OPENAI_API_KEY"] response = completion( model="gpt-3.5-turbo", - messages=messages, + messages=[{"role": "user", "content": "Hey"}], max_tokens=10, - request_timeout=1, metadata={"hi": "bye"}, ) print("This is the response object\n", response) @@ -565,6 +565,7 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None + raise Exception("it works!") except Timeout as e: pass except Exception as e: diff --git a/litellm/utils.py b/litellm/utils.py index 31eeaacab4..62315b3d97 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2350,7 +2350,9 @@ def client(original_function): elif user_max_tokens + input_tokens > max_output_tokens: user_max_tokens = max_output_tokens - input_tokens print_verbose(f"user_max_tokens: {user_max_tokens}") - kwargs["max_tokens"] = user_max_tokens + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int except Exception as e: print_verbose(f"Error while checking max token limit: {str(e)}") # MODEL CALL From 955dbb179c152b05b53019becba7e222ee30c24f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:49 -0800 Subject: [PATCH 032/218] =?UTF-8?q?bump:=20version=201.22.8=20=E2=86=92=20?= =?UTF-8?q?1.22.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17d80ae8ee..944aad7f8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.8" +version = "1.22.9" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.8" +version = "1.22.9" version_files = [ "pyproject.toml:^version" ] From 03656ff1715883f2a1a11b6dc6402fb7e07384fa Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Mon, 5 Feb 2024 17:07:57 -0800 Subject: [PATCH 033/218] Update model_prices_and_context_window.json --- model_prices_and_context_window.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index b6ded001c9..4c28bdbe8b 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -156,8 +156,8 @@ "max_tokens": 4097, "max_input_tokens": 4097, "max_output_tokens": 4096, - "input_cost_per_token": 0.000012, - "output_cost_per_token": 0.000016, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000006, "litellm_provider": "openai", "mode": "chat" }, From 6178b43af3dde78d314c5a8c32cb361a1d7c5b97 Mon Sep 17 00:00:00 2001 From: John HU Date: Mon, 5 Feb 2024 17:30:39 -0800 Subject: [PATCH 034/218] Fix admin UI title and 
description --- ui/litellm-dashboard/src/app/layout.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/litellm-dashboard/src/app/layout.tsx b/ui/litellm-dashboard/src/app/layout.tsx index 3314e4780a..a04a0d66ed 100644 --- a/ui/litellm-dashboard/src/app/layout.tsx +++ b/ui/litellm-dashboard/src/app/layout.tsx @@ -5,8 +5,8 @@ import "./globals.css"; const inter = Inter({ subsets: ["latin"] }); export const metadata: Metadata = { - title: "Create Next App", - description: "Generated by create next app", + title: "🚅 LiteLLM", + description: "LiteLLM Proxy Admin UI", }; export default function RootLayout({ From e62081a681c6cab6b159542da4220bae49590dad Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 16:16:15 -0800 Subject: [PATCH 035/218] fix(langfuse.py): support logging failed llm api calls to langfuse --- litellm/integrations/langfuse.py | 198 +++++++++++++++++++------------ litellm/utils.py | 58 ++++----- 2 files changed, 151 insertions(+), 105 deletions(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index e62dccdc47..82de333660 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -55,8 +55,21 @@ class LangFuseLogger: else: self.upstream_langfuse = None + # def log_error(kwargs, response_obj, start_time, end_time): + # generation = trace.generation( + # level ="ERROR" # can be any of DEBUG, DEFAULT, WARNING or ERROR + # status_message='error' # can be any string (e.g. stringified stack trace or error body) + # ) def log_event( - self, kwargs, response_obj, start_time, end_time, user_id, print_verbose + self, + kwargs, + response_obj, + start_time, + end_time, + user_id, + print_verbose, + level="DEFAULT", + status_message=None, ): # Method definition @@ -84,37 +97,49 @@ class LangFuseLogger: pass # end of processing langfuse ######################## - if kwargs.get("call_type", None) == "embedding" or isinstance( - response_obj, litellm.EmbeddingResponse + if ( + level == "ERROR" + and status_message is not None + and isinstance(status_message, str) + ): + input = prompt + output = status_message + elif response_obj is not None and ( + kwargs.get("call_type", None) == "embedding" + or isinstance(response_obj, litellm.EmbeddingResponse) ): input = prompt output = response_obj["data"] - else: + elif response_obj is not None: input = prompt output = response_obj["choices"][0]["message"].json() - print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}") - self._log_langfuse_v2( - user_id, - metadata, - output, - start_time, - end_time, - kwargs, - optional_params, - input, - response_obj, - print_verbose, - ) if self._is_langfuse_v2() else self._log_langfuse_v1( - user_id, - metadata, - output, - start_time, - end_time, - kwargs, - optional_params, - input, - response_obj, - ) + print(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}") + if self._is_langfuse_v2(): + self._log_langfuse_v2( + user_id, + metadata, + output, + start_time, + end_time, + kwargs, + optional_params, + input, + response_obj, + level, + print_verbose, + ) + elif response_obj is not None: + self._log_langfuse_v1( + user_id, + metadata, + output, + start_time, + end_time, + kwargs, + optional_params, + input, + response_obj, + ) self.Langfuse.flush() print_verbose( @@ -123,15 +148,15 @@ class LangFuseLogger: verbose_logger.info(f"Langfuse Layer Logging - logging success") except: traceback.print_exc() - print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}") + 
print(f"Langfuse Layer Error - {traceback.format_exc()}") pass async def _async_log_event( self, kwargs, response_obj, start_time, end_time, user_id, print_verbose ): - self.log_event( - kwargs, response_obj, start_time, end_time, user_id, print_verbose - ) + """ + TODO: support async calls when langfuse is truly async + """ def _is_langfuse_v2(self): import langfuse @@ -193,57 +218,78 @@ class LangFuseLogger: optional_params, input, response_obj, + level, print_verbose, ): import langfuse - tags = [] - supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") - supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3") + try: + tags = [] + supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") + supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3") - print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ") + print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ") - generation_name = metadata.get("generation_name", None) - if generation_name is None: - # just log `litellm-{call_type}` as the generation name - generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" + generation_name = metadata.get("generation_name", None) + if generation_name is None: + # just log `litellm-{call_type}` as the generation name + generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" - trace_params = { - "name": generation_name, - "input": input, - "output": output, - "user_id": metadata.get("trace_user_id", user_id), - "id": metadata.get("trace_id", None), - "session_id": metadata.get("session_id", None), - } - cost = kwargs["response_cost"] - print_verbose(f"trace: {cost}") - if supports_tags: - for key, value in metadata.items(): - tags.append(f"{key}:{value}") - if "cache_hit" in kwargs: - tags.append(f"cache_hit:{kwargs['cache_hit']}") - trace_params.update({"tags": tags}) + trace_params = { + "name": generation_name, + "input": input, + "user_id": metadata.get("trace_user_id", user_id), + "id": metadata.get("trace_id", None), + "session_id": metadata.get("session_id", None), + } - trace = self.Langfuse.trace(**trace_params) + if level == "ERROR": + trace_params["status_message"] = output + else: + trace_params["output"] = output - # get generation_id - generation_id = None - if response_obj.get("id", None) is not None: - generation_id = litellm.utils.get_logging_id(start_time, response_obj) - trace.generation( - name=generation_name, - id=metadata.get("generation_id", generation_id), - startTime=start_time, - endTime=end_time, - model=kwargs["model"], - modelParameters=optional_params, - input=input, - output=output, - usage={ - "prompt_tokens": response_obj["usage"]["prompt_tokens"], - "completion_tokens": response_obj["usage"]["completion_tokens"], - "total_cost": cost if supports_costs else None, - }, - metadata=metadata, - ) + cost = kwargs.get("response_cost", None) + print_verbose(f"trace: {cost}") + if supports_tags: + for key, value in metadata.items(): + tags.append(f"{key}:{value}") + if "cache_hit" in kwargs: + tags.append(f"cache_hit:{kwargs['cache_hit']}") + trace_params.update({"tags": tags}) + + trace = self.Langfuse.trace(**trace_params) + + if level == "ERROR": + trace.generation( + level="ERROR", # can be any of DEBUG, DEFAULT, WARNING or ERROR + status_message=output, # can be any string (e.g. 
stringified stack trace or error body) + ) + print(f"SUCCESSFULLY LOGGED ERROR") + else: + # get generation_id + generation_id = None + if ( + response_obj is not None + and response_obj.get("id", None) is not None + ): + generation_id = litellm.utils.get_logging_id( + start_time, response_obj + ) + trace.generation( + name=generation_name, + id=metadata.get("generation_id", generation_id), + startTime=start_time, + endTime=end_time, + model=kwargs["model"], + modelParameters=optional_params, + input=input, + output=output, + usage={ + "prompt_tokens": response_obj["usage"]["prompt_tokens"], + "completion_tokens": response_obj["usage"]["completion_tokens"], + "total_cost": cost if supports_costs else None, + }, + metadata=metadata, + ) + except Exception as e: + print(f"Langfuse Layer Error - {traceback.format_exc()}") diff --git a/litellm/utils.py b/litellm/utils.py index e56ba879f8..1e83a319f4 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1636,34 +1636,6 @@ class Logging: end_time=end_time, print_verbose=print_verbose, ) - if callback == "langfuse": - global langFuseLogger - print_verbose("reaches Async langfuse for logging!") - kwargs = {} - for k, v in self.model_call_details.items(): - if ( - k != "original_response" - ): # copy.deepcopy raises errors as this could be a coroutine - kwargs[k] = v - # this only logs streaming once, complete_streaming_response exists i.e when stream ends - if self.stream: - if "complete_streaming_response" not in kwargs: - return - else: - print_verbose( - "reaches Async langfuse for streaming logging!" - ) - result = kwargs["complete_streaming_response"] - if langFuseLogger is None: - langFuseLogger = LangFuseLogger() - await langFuseLogger._async_log_event( - kwargs=kwargs, - response_obj=result, - start_time=start_time, - end_time=end_time, - user_id=kwargs.get("user", None), - print_verbose=print_verbose, - ) except: print_verbose( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}" @@ -1788,9 +1760,37 @@ class Logging: response_obj=result, kwargs=self.model_call_details, ) + elif callback == "langfuse": + global langFuseLogger + verbose_logger.debug("reaches langfuse for logging!") + kwargs = {} + for k, v in self.model_call_details.items(): + if ( + k != "original_response" + ): # copy.deepcopy raises errors as this could be a coroutine + kwargs[k] = v + # this only logs streaming once, complete_streaming_response exists i.e when stream ends + if langFuseLogger is None or ( + self.langfuse_public_key != langFuseLogger.public_key + and self.langfuse_secret != langFuseLogger.secret_key + ): + langFuseLogger = LangFuseLogger( + langfuse_public_key=self.langfuse_public_key, + langfuse_secret=self.langfuse_secret, + ) + langFuseLogger.log_event( + start_time=start_time, + end_time=end_time, + response_obj=None, + user_id=kwargs.get("user", None), + print_verbose=print_verbose, + status_message=str(exception), + level="ERROR", + kwargs=self.model_call_details, + ) except Exception as e: print_verbose( - f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {traceback.format_exc()}" + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {str(e)}" ) print_verbose( f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}" From 7a4045174c476e3cb7336ac5466197208482149e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:37:05 -0800 Subject: [PATCH 036/218] (docs) 
upperbound_key_generate_params --- docs/my-website/docs/proxy/virtual_keys.md | 16 ++++++++++++++++ .../model_prices_and_context_window_backup.json | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md index dd5edc6da8..c51bfc0ac9 100644 --- a/docs/my-website/docs/proxy/virtual_keys.md +++ b/docs/my-website/docs/proxy/virtual_keys.md @@ -352,6 +352,22 @@ Request Params: } ``` +## Upperbound /key/generate params +Use this, if you need to control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key. + +Set `litellm_settings:upperbound_key_generate_params`: +```yaml +litellm_settings: + upperbound_key_generate_params: + max_budget: 100 # upperbound of $100, for all /key/generate requests + duration: "30d" # upperbound of 30 days for all /key/generate requests +``` + +** Expected Behavior ** + +- Send a `/key/generate` request with `max_budget=200` +- Key will be created with `max_budget=100` since 100 is the upper bound + ## Default /key/generate params Use this, if you need to control the default `max_budget` or any `key/generate` param per key. diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index b6ded001c9..4c28bdbe8b 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -156,8 +156,8 @@ "max_tokens": 4097, "max_input_tokens": 4097, "max_output_tokens": 4096, - "input_cost_per_token": 0.000012, - "output_cost_per_token": 0.000016, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000006, "litellm_provider": "openai", "mode": "chat" }, From 1b7cd40ab4827ef42d8e6dbcbecc0e7ae4b43ac7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:38:47 -0800 Subject: [PATCH 037/218] (feat) upperbound_key_generate_params --- litellm/__init__.py | 1 + litellm/proxy/proxy_server.py | 69 +++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 3f2a1e4b4d..26b761c64a 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -146,6 +146,7 @@ suppress_debug_info = False dynamodb_table_name: Optional[str] = None s3_callback_params: Optional[Dict] = None default_key_generate_params: Optional[Dict] = None +upperbound_key_generate_params: Optional[Dict] = None default_team_settings: Optional[List] = None #### RELIABILITY #### request_timeout: Optional[float] = 6000 diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 0fe6997eec..c2d3d194ae 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1401,6 +1401,26 @@ class ProxyConfig: proxy_config = ProxyConfig() +def _duration_in_seconds(duration: str): + match = re.match(r"(\d+)([smhd]?)", duration) + if not match: + raise ValueError("Invalid duration format") + + value, unit = match.groups() + value = int(value) + + if unit == "s": + return value + elif unit == "m": + return value * 60 + elif unit == "h": + return value * 3600 + elif unit == "d": + return value * 86400 + else: + raise ValueError("Unsupported duration unit") + + async def generate_key_helper_fn( duration: Optional[str], models: list, @@ -1435,25 +1455,6 @@ async def generate_key_helper_fn( if token is None: token = f"sk-{secrets.token_urlsafe(16)}" - def _duration_in_seconds(duration: str): - match = 
re.match(r"(\d+)([smhd]?)", duration) - if not match: - raise ValueError("Invalid duration format") - - value, unit = match.groups() - value = int(value) - - if unit == "s": - return value - elif unit == "m": - return value * 60 - elif unit == "h": - return value * 3600 - elif unit == "d": - return value * 86400 - else: - raise ValueError("Unsupported duration unit") - if duration is None: # allow tokens that never expire expires = None else: @@ -2674,6 +2675,36 @@ async def generate_key_fn( elif key == "metadata" and value == {}: setattr(data, key, litellm.default_key_generate_params.get(key, {})) + # check if user set default key/generate params on config.yaml + if litellm.upperbound_key_generate_params is not None: + for elem in data: + # if key in litellm.upperbound_key_generate_params, use the min of value and litellm.upperbound_key_generate_params[key] + key, value = elem + if value is not None and key in litellm.upperbound_key_generate_params: + # if value is float/int + if key in [ + "max_budget", + "max_parallel_requests", + "tpm_limit", + "rpm_limit", + ]: + if value > litellm.upperbound_key_generate_params[key]: + # directly compare floats/ints + setattr( + data, key, litellm.upperbound_key_generate_params[key] + ) + elif key == "budget_duration": + # budgets are in 1s, 1m, 1h, 1d, 1m (30s, 30m, 30h, 30d, 30m) + # compare the duration in seconds and max duration in seconds + upperbound_budget_duration = _duration_in_seconds( + duration=litellm.upperbound_key_generate_params[key] + ) + user_set_budget_duration = _duration_in_seconds(duration=value) + if user_set_budget_duration > upperbound_budget_duration: + setattr( + data, key, litellm.upperbound_key_generate_params[key] + ) + data_json = data.json() # type: ignore # if we get max_budget passed to /key/generate, then use it as key_max_budget. 
Since generate_key_helper_fn is used to make new users From 06388101665c82d2fc0e1f537fdbbeb0dc064ada Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:39:36 -0800 Subject: [PATCH 038/218] (test) test_upperbound_key_params --- litellm/tests/test_key_generate_prisma.py | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py index de26168591..b4c86afb25 100644 --- a/litellm/tests/test_key_generate_prisma.py +++ b/litellm/tests/test_key_generate_prisma.py @@ -1279,6 +1279,40 @@ async def test_default_key_params(prisma_client): pytest.fail(f"Got exception {e}") +@pytest.mark.asyncio() +async def test_upperbound_key_params(prisma_client): + """ + - create key + - get key info + - assert key_name is not null + """ + setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) + setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") + litellm.upperbound_key_generate_params = { + "max_budget": 0.001, + "budget_duration": "1m", + } + await litellm.proxy.proxy_server.prisma_client.connect() + try: + request = GenerateKeyRequest( + max_budget=200000, + budget_duration="30d", + ) + key = await generate_key_fn(request) + generated_key = key.key + + result = await info_key_fn(key=generated_key) + key_info = result["info"] + # assert it used the upper bound for max_budget, and budget_duration + assert key_info["max_budget"] == 0.001 + assert key_info["budget_duration"] == "1m" + + print(result) + except Exception as e: + print("Got Exception", e) + pytest.fail(f"Got exception {e}") + + def test_get_bearer_token(): from litellm.proxy.proxy_server import _get_bearer_token From 20ea6f9481501b3de0fd4885d36858fa95f2b104 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:40:52 -0800 Subject: [PATCH 039/218] (feat) proxy - upperbound params /key/generate --- litellm/proxy/proxy_config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 874049a752..bd844bd7ba 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,6 +73,9 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None + upperbound_key_generate_params: + max_budget: 100 + duration: "30d" # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From e9248b377b3754808b4f76f1be76bb054c641b0d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:51:08 -0800 Subject: [PATCH 040/218] (fix) proxy startup test --- .../test_configs/test_config_no_auth.yaml | 95 ------------------- 1 file changed, 95 deletions(-) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index 8441018e35..ccebe016db 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,21 +9,11 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model -- litellm_params: - api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 - api_key: os.environ/AZURE_API_KEY - model: azure/chatgpt-v-2 - model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo 
model_name: azure-model -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -36,93 +26,8 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 - model_name: test_openai_models -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-07-01-preview - model: azure/azure-embedding-model - model_info: - mode: embedding - model_name: azure-embedding-model -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 55848c55-4162-40f9-a6e2-9a722b9ef404 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 34339b1e-e030-4bcc-a531-c48559f10ce4 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: f6f74e14-ac64-4403-9365-319e584dcdc5 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 9b1ef341-322c-410a-8992-903987fef439 - model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings -- litellm_params: - model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 - model_info: - mode: embedding - model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) -- litellm_params: - model: dall-e-3 - model_info: - mode: image_generation - model_name: dall-e-3 -- litellm_params: - api_base: os.environ/AZURE_SWEDEN_API_BASE - api_key: os.environ/AZURE_SWEDEN_API_KEY - api_version: 2023-12-01-preview - model: azure/dall-e-3-test - model_info: - mode: image_generation - model_name: dall-e-3 -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-06-01-preview - model: azure/ - model_info: - mode: image_generation - model_name: dall-e-2 -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-07-01-preview - model: azure/azure-embedding-model - model_info: - base_model: text-embedding-ada-002 - mode: embedding - model_name: text-embedding-ada-002 -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 - model_name: test_openai_models From 1a6515e6d4582011f0d7cc3ae1db26e8bbb3e778 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:53:31 -0800 Subject: [PATCH 041/218] (ci/cd) print debug info for test_proxy_gunicorn_startup_config_dict --- litellm/tests/test_proxy_startup.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_proxy_startup.py index 650e2f8a7a..a846c9f4a3 100644 --- a/litellm/tests/test_proxy_startup.py +++ b/litellm/tests/test_proxy_startup.py @@ -33,6 +33,11 @@ def test_proxy_gunicorn_startup_direct_config(): Test both approaches """ try: + from litellm._logging import 
verbose_proxy_logger, verbose_router_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) + verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -48,6 +53,11 @@ def test_proxy_gunicorn_startup_direct_config(): def test_proxy_gunicorn_startup_config_dict(): try: + from litellm._logging import verbose_proxy_logger, verbose_router_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) + verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" From 572b85adf9df8978a577ab34ca332bfeb15648a9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 06:46:49 -0800 Subject: [PATCH 042/218] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 34dc0e3b57..528bb19d2a 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -379,6 +379,7 @@ async def test_normal_router_tpm_limit(): ) except Exception as e: + print("Exception on test_normal_router_tpm_limit", e) assert e.status_code == 429 From dcb61580f50d68352aec4c4359a007629bd2e512 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:22:54 -0800 Subject: [PATCH 043/218] fix(ollama_chat.py): fix ollama chat completion token counting --- litellm/llms/ollama_chat.py | 8 ++++++-- litellm/utils.py | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 95ff8dfaa3..3628ae2903 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -320,11 +320,15 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj): model_response["choices"][0]["message"] = message else: model_response["choices"][0]["message"] = response_json["message"] + model_response["created"] = int(time.time()) - model_response["model"] = "ollama/" + data["model"] + model_response["model"] = "ollama_chat/" + data["model"] prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore completion_tokens = response_json.get( - "eval_count", litellm.token_counter(text=response_json["message"]) + "eval_count", + litellm.token_counter( + text=response_json["message"]["content"], count_response_tokens=True + ), ) model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens, diff --git a/litellm/utils.py b/litellm/utils.py index 1e83a319f4..8491a1d5e1 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -983,9 +983,6 @@ class Logging: verbose_logger.debug( f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" ) - verbose_logger.debug( - f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}" - ) if self.logger_fn and callable(self.logger_fn): try: self.logger_fn( From 2a285e419f1ea5ec57a019c40630afa8ae6b0cb5 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:26:13 -0800 Subject: [PATCH 044/218] fix(utils.py): use print_verbose for statements, so debug can be seen when running sdk --- litellm/utils.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 8491a1d5e1..5ccb85ef05 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -980,7 +980,7 @@ class Logging: self.model_call_details["log_event_type"] = "post_api_call" # User Logging -> if you pass in a custom logging function - verbose_logger.debug( + print_verbose( f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" ) if self.logger_fn and callable(self.logger_fn): From 5e49c3f6c00ecd94e7cc2c3f47cc452c5d91ac1f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:43:47 -0800 Subject: [PATCH 045/218] fix(ollama_chat.py): explicitly state if ollama call is streaming or not --- litellm/llms/ollama_chat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 3628ae2903..d1a439398b 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,7 +146,12 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "messages": messages, "options": optional_params} + data = { + "model": model, + "messages": messages, + "options": optional_params, + "stream": stream, + } ## LOGGING logging_obj.pre_call( input=None, From 99800fe811661698d9c9e6799dd2cdf9d78b7dec Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:44:04 -0800 Subject: [PATCH 046/218] =?UTF-8?q?bump:=20version=201.22.6=20=E2=86=92=20?= =?UTF-8?q?1.22.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 06dedbed63..be8c8966be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.6" +version = "1.22.7" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.6" +version = "1.22.7" version_files = [ "pyproject.toml:^version" ] From 64634b53f5be8ef57ae5b725a4c04fbeaab64bfe Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:35:46 -0800 Subject: [PATCH 047/218] build(requirements.txt): update the proxy requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c9bd0e511d..768e8dff3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions -google-generativeai==0.1.0 # for vertex ai calls +google-generativeai==0.3.2 # for vertex ai calls async_generator==1.10.0 # for async ollama calls traceloop-sdk==0.5.3 # for open telemetry logging langfuse>=2.6.3 # for langfuse self-hosted logging From c8dfc76984005b54165abf0970754514e5caef6a Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:11:43 -0800 Subject: [PATCH 048/218] fix(ollama.py): support format for ollama --- litellm/llms/ollama.py | 10 +++++++++- litellm/llms/ollama_chat.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index d0bc24af4c..9339deb78d 100644 --- a/litellm/llms/ollama.py +++ 
b/litellm/llms/ollama.py @@ -146,7 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "prompt": prompt, "options": optional_params} + format = optional_params.pop("format", None) + data = { + "model": model, + "prompt": prompt, + "options": optional_params, + "stream": stream, + } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index d1a439398b..0311931b13 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,12 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) + format = optional_params.pop("format", None) data = { "model": model, "messages": messages, "options": optional_params, "stream": stream, } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( input=None, From 7b033d44f5ef80dcfd62520104bd1accdd843ee9 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:12:13 -0800 Subject: [PATCH 049/218] =?UTF-8?q?bump:=20version=201.22.7=20=E2=86=92=20?= =?UTF-8?q?1.22.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be8c8966be..17d80ae8ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.7" +version = "1.22.8" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.7" +version = "1.22.8" version_files = [ "pyproject.toml:^version" ] From e94bae4939ea2fba647e32deb0457bed7dd899c9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:57:20 -0800 Subject: [PATCH 050/218] (ci/cd) run in verbose mode --- .circleci/config.yml | 2 +- litellm/tests/test_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c1224159a1..9a29ed07ca 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From bba33c1146bee64bc1f0e0e98955d03634035f9c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:27:24 -0800 Subject: [PATCH 051/218] (fix) rename proxy startup test --- litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} (100%) diff --git a/litellm/tests/test_proxy_startup.py 
b/litellm/tests/test_aproxy_startup.py similarity index 100% rename from litellm/tests/test_proxy_startup.py rename to litellm/tests/test_aproxy_startup.py From 3f20a946678db0260a625d1749bf206674232d0d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:38:57 -0800 Subject: [PATCH 052/218] (fix) proxy_startup test --- litellm/tests/test_aproxy_startup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/litellm/tests/test_aproxy_startup.py b/litellm/tests/test_aproxy_startup.py index a846c9f4a3..024d69b1ff 100644 --- a/litellm/tests/test_aproxy_startup.py +++ b/litellm/tests/test_aproxy_startup.py @@ -36,6 +36,11 @@ def test_proxy_gunicorn_startup_direct_config(): from litellm._logging import verbose_proxy_logger, verbose_router_logger import logging + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +54,10 @@ def test_proxy_gunicorn_startup_direct_config(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url def test_proxy_gunicorn_startup_config_dict(): @@ -58,6 +67,11 @@ def test_proxy_gunicorn_startup_config_dict(): verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -71,6 +85,10 @@ def test_proxy_gunicorn_startup_config_dict(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url # test_proxy_gunicorn_startup() From 97cb9756a7e796af2acf5953db9709864f5c442b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:16 -0800 Subject: [PATCH 053/218] (ci/cd) run pytest without -s --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a29ed07ca..c1224159a1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results From 013b8beef6d451cebcd145fa7d0bdb66dbcb3b1c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:24 -0800 Subject: [PATCH 054/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add 
any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 742d9aa461f0321dc0f4f2879c651883f18235c4 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:43:28 -0800 Subject: [PATCH 055/218] (fix) parallel_request_limiter debug --- litellm/proxy/hooks/parallel_request_limiter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index ca60421a50..48cf5b7799 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -130,7 +130,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger): "current_rpm": current["current_rpm"] + 1, } - self.print_verbose(f"updated_value in success call: {new_val}") + self.print_verbose( + f"updated_value in success call: {new_val}, precise_minute: {precise_minute}" + ) self.user_api_key_cache.set_cache( request_count_api_key, new_val, ttl=60 ) # store in cache for 1 min. From c098c920aa068acdb300f2955e3e92b03aa6cce8 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:44:30 -0800 Subject: [PATCH 056/218] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 528bb19d2a..bfac8ddeae 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -306,6 +306,10 @@ async def test_normal_router_call(): @pytest.mark.asyncio async def test_normal_router_tpm_limit(): + from litellm._logging import verbose_proxy_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) model_list = [ { "model_name": "azure-model", @@ -353,6 +357,7 @@ async def test_normal_router_tpm_limit(): current_minute = datetime.now().strftime("%M") precise_minute = f"{current_date}-{current_hour}-{current_minute}" request_count_api_key = f"{_api_key}::{precise_minute}::request_count" + print("Test: Checking current_requests for precise_minute=", precise_minute) assert ( parallel_request_handler.user_api_key_cache.get_cache( @@ -366,6 +371,7 @@ async def test_normal_router_tpm_limit(): model="azure-model", messages=[{"role": "user", "content": "Write me a paragraph on the moon"}], metadata={"user_api_key": _api_key}, + mock_response="hello", ) await asyncio.sleep(1) # success is done in a separate thread print(f"response: {response}") From d05f33e98e94d90629519914efe6d43876504f7f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:47:19 -0800 Subject: [PATCH 057/218] (ci/cd) fix test_config_no_auth --- .../test_configs/test_config_no_auth.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index ccebe016db..9d7aff5702 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,11 +9,21 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model +- litellm_params: + api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 + api_key: os.environ/AZURE_API_KEY + model: azure/chatgpt-v-2 + model_name: azure-cloudflare-model - 
litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -26,8 +36,93 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 + model_name: test_openai_models +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + mode: embedding + model_name: azure-embedding-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 55848c55-4162-40f9-a6e2-9a722b9ef404 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34339b1e-e030-4bcc-a531-c48559f10ce4 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: f6f74e14-ac64-4403-9365-319e584dcdc5 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 9b1ef341-322c-410a-8992-903987fef439 + model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings +- litellm_params: + model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: + mode: embedding + model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) +- litellm_params: + model: dall-e-3 + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_SWEDEN_API_BASE + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_version: 2023-12-01-preview + model: azure/dall-e-3-test + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-06-01-preview + model: azure/ + model_info: + mode: image_generation + model_name: dall-e-2 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + base_model: text-embedding-ada-002 + mode: embedding + model_name: text-embedding-ada-002 +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 + model_name: test_openai_models \ No newline at end of file From 6e83e30ddb286d9ac8a7e6f6d1ce173f888972db Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:53:47 -0800 Subject: [PATCH 058/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def 
test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 2187015ebd1e8acc7a0023bffa54384f8defc063 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:02:36 -0800 Subject: [PATCH 059/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 2ad55fd482ed8181d9b96c8be0fa7fdaf0a978d1 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:29 -0800 Subject: [PATCH 060/218] fix(utils.py): round max tokens to be int always --- litellm/tests/test_completion.py | 5 +++-- litellm/utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..de79c97afa 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -544,13 +544,13 @@ def hf_test_completion_tgi(): def test_completion_openai(): try: litellm.set_verbose = True + litellm.drop_params = True print(f"api key: {os.environ['OPENAI_API_KEY']}") litellm.api_key = os.environ["OPENAI_API_KEY"] response = completion( model="gpt-3.5-turbo", - messages=messages, + messages=[{"role": "user", "content": "Hey"}], max_tokens=10, - request_timeout=1, metadata={"hi": "bye"}, ) print("This is the response object\n", response) @@ -565,6 +565,7 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None + raise Exception("it works!") except Timeout as e: pass except Exception as e: diff --git a/litellm/utils.py b/litellm/utils.py index 5ccb85ef05..fdca57e51f 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2348,7 +2348,9 @@ def client(original_function): elif user_max_tokens + input_tokens > max_output_tokens: user_max_tokens = max_output_tokens - input_tokens print_verbose(f"user_max_tokens: {user_max_tokens}") - kwargs["max_tokens"] = user_max_tokens + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int except Exception as e: print_verbose(f"Error while checking max token limit: {str(e)}") # MODEL CALL From 4302e276e943cfaf95c63d334f00910de46a7efe Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:49 -0800 Subject: [PATCH 061/218] =?UTF-8?q?bump:=20version=201.22.8=20=E2=86=92=20?= =?UTF-8?q?1.22.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17d80ae8ee..944aad7f8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.8" +version = "1.22.9" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] 
build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.8" +version = "1.22.9" version_files = [ "pyproject.toml:^version" ] From bdb1f596d5aa56783f9123dc5c9c2050601c8e5d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:09:48 -0800 Subject: [PATCH 062/218] (feat) show langfuse logging tags better through proxy --- litellm/integrations/langfuse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 82de333660..3c3e793dfb 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -252,8 +252,14 @@ class LangFuseLogger: print_verbose(f"trace: {cost}") if supports_tags: for key, value in metadata.items(): - tags.append(f"{key}:{value}") + if key in [ + "user_api_key", + "user_api_key_user_id", + ]: + tags.append(f"{key}:{value}") if "cache_hit" in kwargs: + if kwargs["cache_hit"] is None: + kwargs["cache_hit"] = False tags.append(f"cache_hit:{kwargs['cache_hit']}") trace_params.update({"tags": tags}) From bf020fcf33ba05a1b48b971e364d3266c769c51c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 12:28:21 -0800 Subject: [PATCH 063/218] (feat )add semantic cache --- litellm/caching.py | 102 +++++++++++++++++++++++++++++++++- litellm/tests/test_caching.py | 25 +++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index d0721fe9a9..e1ef95dc34 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -83,7 +83,6 @@ class InMemoryCache(BaseCache): self.cache_dict.clear() self.ttl_dict.clear() - async def disconnect(self): pass @@ -217,7 +216,6 @@ class RedisCache(BaseCache): def flush_cache(self): self.redis_client.flushall() - async def disconnect(self): pass @@ -225,6 +223,102 @@ class RedisCache(BaseCache): self.redis_client.delete(key) +class RedisSemanticCache(RedisCache): + def __init__(self, host, port, password, **kwargs): + super().__init__() + + # from redis.commands.search.field import TagField, TextField, NumericField, VectorField + # from redis.commands.search.indexDefinition import IndexDefinition, IndexType + # from redis.commands.search.query import Query + + # INDEX_NAME = 'idx:litellm_completion_response_vss' + # DOC_PREFIX = 'bikes:' + + # try: + # # check to see if index exists + # client.ft(INDEX_NAME).info() + # print('Index already exists!') + # except: + # # schema + # schema = ( + # TextField('$.model', no_stem=True, as_name='model'), + # TextField('$.brand', no_stem=True, as_name='brand'), + # NumericField('$.price', as_name='price'), + # TagField('$.type', as_name='type'), + # TextField('$.description', as_name='description'), + # VectorField('$.description_embeddings', + # 'FLAT', { + # 'TYPE': 'FLOAT32', + # 'DIM': VECTOR_DIMENSION, + # 'DISTANCE_METRIC': 'COSINE', + # }, as_name='vector' + # ), + # ) + + # # index Definition + # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) + + # # create Index + # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) + print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") + try: + # get text response + # print("in redis semantic cache: value: ", value) + llm_response = value["response"] + + # if llm_response is a string, convert it to a dictionary + if isinstance(llm_response, str): + llm_response = json.loads(llm_response) + + # print("converted 
llm_response: ", llm_response) + response = llm_response["choices"][0]["message"]["content"] + + # create embedding response + + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=response, + cache={"no-store": True}, + ) + + raw_embedding = embedding_response["data"][0]["embedding"] + raw_embedding_dimension = len(raw_embedding) + + # print("embedding: ", raw_embedding) + key = "litellm-semantic:" + key + self.redis_client.json().set( + name=key, + path="$", + obj=json.dumps( + { + "response": response, + "embedding": raw_embedding, + "dimension": raw_embedding_dimension, + } + ), + ) + + stored_redis_value = self.redis_client.json().get(name=key) + + # print("Stored Redis Value: ", stored_redis_value) + + except Exception as e: + # print("Error occurred: ", e) + # NON blocking - notify users Redis is throwing an exception + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + + def get_cache(self, key, **kwargs): + pass + + async def async_set_cache(self, key, value, **kwargs): + pass + + async def async_get_cache(self, key, **kwargs): + pass + class S3Cache(BaseCache): def __init__( @@ -429,7 +523,7 @@ class DualCache(BaseCache): class Cache: def __init__( self, - type: Optional[Literal["local", "redis", "s3"]] = "local", + type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local", host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, @@ -468,6 +562,8 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) + elif type == "redis-semantic": + self.cache = RedisSemanticCache(host, port, password, **kwargs) elif type == "local": self.cache = InMemoryCache() elif type == "s3": diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 468ab6f80f..32904ab784 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -987,3 +987,28 @@ def test_cache_context_managers(): # test_cache_context_managers() + + +def test_redis_semantic_cache_completion(): + litellm.set_verbose = False + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + messages = [ + {"role": "user", "content": f"write a one sentence poem about: {random_number}"} + ] + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + ) + print("test2 for Redis Caching - non streaming") + response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) + # response2 = completion( + # model="gpt-3.5-turbo", messages=messages,max_tokens=20 + # ) + + +# test_redis_cache_completion() From dcd091164d945c001614438ef4e0465edc75d364 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:12 -0800 Subject: [PATCH 064/218] (feat) working - sync semantic caching --- litellm/caching.py | 227 ++++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 75 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index e1ef95dc34..0a1046f0d8 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -223,94 +223,161 @@ class RedisCache(BaseCache): self.redis_client.delete(key) -class RedisSemanticCache(RedisCache): - def __init__(self, host, port, password, **kwargs): - super().__init__() +class RedisSemanticCache(BaseCache): + def __init__( + self, + host=None, + port=None, + password=None, + redis_url=None, + 
similarity_threshold=None, + **kwargs, + ): + from redisvl.index import SearchIndex + from redisvl.query import VectorQuery - # from redis.commands.search.field import TagField, TextField, NumericField, VectorField - # from redis.commands.search.indexDefinition import IndexDefinition, IndexType - # from redis.commands.search.query import Query + print_verbose( + "redis semantic-cache initializing INDEX - litellm_semantic_cache_index" + ) + if similarity_threshold is None: + raise Exception("similarity_threshold must be provided, passed None") + self.similarity_threshold = similarity_threshold + schema = { + "index": { + "name": "litellm_semantic_cache_index", + "prefix": "litellm", + "storage_type": "hash", + }, + "fields": { + "text": [{"name": "response"}], + "text": [{"name": "prompt"}], + "vector": [ + { + "name": "litellm_embedding", + "dims": 1536, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + } + ], + }, + } + self.index = SearchIndex.from_dict(schema) + if redis_url is None: + # if no url passed, check if host, port and password are passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message 
in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". 
Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. **kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From d7116c4c5c3321dc33ddc8c9c68197fa9d349c03 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 065/218] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab784..3ac812cf35 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From ab4b31d45baaafe93128beaafb9f7da28f920c0e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 066/218] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf35..4b47614cca 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], 
password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # test_redis_cache_completion() From 705531da10cfb7da496b5b97e4d562ec06357287 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 067/218] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d8..877f935fab 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From 30a209223bb74efa914c88eae7a717424e913367 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 068/218] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fab..ad37f2077c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class 
RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! 
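+            # note: vector_distance is cosine distance, so similarity = 1 - vector_distance;
+            # e.g. with similarity_threshold=0.8, a distance of 0.15 (similarity 0.85) counts as a hit,
+            # while a distance of 0.35 (similarity 0.65) falls through to this miss branch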
+ return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From c9c3dbf3d43d768e2bae7c7b7c2a4bdae3adc2a4 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 069/218] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cca..a1a42ff659 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From 6d7909685282bff82ba12fbd12f1ef9ee8d1af95 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 070/218] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077c..a7958d074c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or 
port is None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From 268cec0db111fca9af514a2cddb3667f0dc43948 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 071/218] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index fdca57e51f..c25572c03c 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2533,6 +2533,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From 4fd38dd9448b7cced755f13fb712d6a3c1c0cbb6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 072/218] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index bd844bd7ba..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,10 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - upperbound_key_generate_params: - max_budget: 100 - duration: "30d" - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # 
callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From af75d1076e6d80492bbcd6ef50bd61b887b0eba2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 073/218] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff2..4730fc28b1 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 768e8dff3f..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From d0bc5f984c82e6770fe4891a7b000427529b926e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 074/218] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074c..133d1db6dd 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! 
cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From d5b500c0f17dee2c8b54b814458d686440bbdf7c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 075/218] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6dd..6bf53ea451 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + 
[m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From 157c8d05429a74c482f1a4942210ab91dab26440 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 076/218] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b41828..326544f41e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 51e0dd3471db39c694d4bf37190a889fe68e16ec Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 077/218] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed34..3f26878241 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. 
+ + ```shell + REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' + ## OR ## + REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' + REDIS_PORT = "" # REDIS_PORT='18841' + REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' + ``` + +**Additional kwargs** +You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +```shell +REDIS_ = "" +``` + +### Step 3: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + From 48be4a2695e48ced9277db509723108d71d74941 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 078/218] (feat) redis-semantic cache on proxy --- litellm/proxy/proxy_server.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index bd5b43f5f5..046bc71b05 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1168,7 +1168,7 @@ class ProxyConfig: verbose_proxy_logger.debug(f"passed cache type={cache_type}") - if cache_type == "redis": + if cache_type == "redis" or cache_type == "redis-semantic": cache_host = litellm.get_secret("REDIS_HOST", None) cache_port = litellm.get_secret("REDIS_PORT", None) cache_password = litellm.get_secret("REDIS_PASSWORD", None) @@ -1195,6 +1195,9 @@ class ProxyConfig: f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}" ) print() # noqa + if cache_type == "redis-semantic": + # by default this should always be async + cache_params.update({"redis_semantic_cache_use_async": True}) # users can pass os.environ/ variables on the proxy - we should read them from the env for key, value in cache_params.items(): diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 7df0a10361e13702e029a3980361180af45abc9b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:39:44 -0800 Subject: [PATCH 079/218] (fix) test-semantic caching --- litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index a1a42ff659..cc18dda165 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion(): ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" - # assert response1.choices[0].message == 1 + random_number = random.randint(1, 100000) + + response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response2: {response1}") + assert response1.id == response2.id # test_redis_cache_completion() @@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion(): "content": f"write a one sentence poem about: {random_number}", } ], - max_tokens=20, + max_tokens=5, ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" + random_number = 
random.randint(1, 100000) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=5, + ) + print(f"response2: {response2}") + assert response1.id == response2.id From 014a7833418fbd02590bf7d4f9a83bddb8476690 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:53:28 -0800 Subject: [PATCH 080/218] (docs) redis cache --- docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 8a580f087c..7b21d35b6c 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,11 +1,11 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching - In-Memory, Redis, s3 +# Caching - In-Memory, Redis, s3, Redis Semantic Cache [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) -## Initialize Cache - In Memory, Redis, s3 Bucket +## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache @@ -18,7 +18,7 @@ pip install redis ``` For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ -### Quick Start + ```python import litellm from litellm import completion @@ -55,7 +55,7 @@ Set AWS environment variables AWS_ACCESS_KEY_ID = "AKI*******" AWS_SECRET_ACCESS_KEY = "WOl*****" ``` -### Quick Start + ```python import litellm from litellm import completion @@ -80,6 +80,66 @@ response2 = completion( + + +Install redis +```shell +pip install redisvl==0.0.7 +``` + +For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ + +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +random_number = random.randint( + 1, 100000 +) # add a random number to ensure it's always adding / reading from cache + +print("testing semantic caching") +litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here +) +response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response1: {response1}") + +random_number = random.randint(1, 100000) + +response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response2: {response1}") +assert response1.id == response2.id +# response1 == response2, response 1 is cached +``` + + + + + ### Quick Start From 71cb2af495bfab66075e172c576f8ea274014c04 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:54:55 -0800 Subject: [PATCH 081/218] (docs) litellm semantic caching --- docs/my-website/docs/caching/redis_cache.md | 2 +- docs/my-website/docs/proxy/caching.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 7b21d35b6c..75e1db9557 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ 
b/docs/my-website/docs/caching/redis_cache.md @@ -104,7 +104,7 @@ litellm.cache = Cache( host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.8, + similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3f26878241..d5b589e5c2 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -7,6 +7,7 @@ Cache LLM Responses LiteLLM supports: - In Memory Cache - Redis Cache +- Redis Semantic Cache - s3 Bucket Cache ## Quick Start - Redis, s3 Cache, Semantic Cache From 0162a3e9f4afb1cf0076202c9cee76c28362165f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:55:15 -0800 Subject: [PATCH 082/218] (fix) semantic caching --- litellm/tests/test_caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index cc18dda165..96fd8eb9d2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion(): port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", ) response1 = completion( model="gpt-3.5-turbo", From f727f987d2120ea8c1465dde47eb90357da0cf50 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:04:19 -0800 Subject: [PATCH 083/218] (fix) mark semantic caching as beta test --- litellm/tests/test_caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 96fd8eb9d2..6cb5b974a1 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -989,6 +989,7 @@ def test_cache_context_managers(): # test_cache_context_managers() +@pytest.mark.skip(reason="beta test - new redis semantic cache") def test_redis_semantic_cache_completion(): litellm.set_verbose = True import logging @@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() +@pytest.mark.skip(reason="beta test - new redis semantic cache") @pytest.mark.asyncio async def test_redis_semantic_cache_acompletion(): litellm.set_verbose = True From 23c684496e418d07cfb515f6a1457f8f1ac8fab2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:26:48 -0800 Subject: [PATCH 084/218] (ci/cd) run again --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 6cb5b974a1..8433941e90 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -998,7 +998,7 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 - ) # add a random number to ensure it's always adding / reading from cache + ) # add a random number to ensure it's always adding /reading from cache print("testing semantic caching") litellm.cache = Cache( From 01c46ce192fc3bf3b5190d312b94e9937e793a89 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:29:31 -0800 Subject: [PATCH 085/218] test(test_completion.py): fix test --- 
docs/my-website/docs/proxy/caching.md | 7 ++++--- litellm/tests/test_completion.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d5b589e5c2..2b385de8e5 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -211,9 +211,10 @@ litellm_settings: The proxy support 3 cache-controls: -- `ttl`: Will cache the response for the user-defined amount of time (in seconds). -- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). -- `no-cache`: Will not return a cached response, but instead call the actual endpoint. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* Will not cache the response. [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index de79c97afa..b075e48190 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -565,7 +565,6 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None - raise Exception("it works!") except Timeout as e: pass except Exception as e: From a13a45896a87ed9018c4fa2e5b14d4355d427cc0 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:09:48 -0800 Subject: [PATCH 086/218] (feat) show langfuse logging tags better through proxy --- litellm/integrations/langfuse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 82de333660..3c3e793dfb 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -252,8 +252,14 @@ class LangFuseLogger: print_verbose(f"trace: {cost}") if supports_tags: for key, value in metadata.items(): - tags.append(f"{key}:{value}") + if key in [ + "user_api_key", + "user_api_key_user_id", + ]: + tags.append(f"{key}:{value}") if "cache_hit" in kwargs: + if kwargs["cache_hit"] is None: + kwargs["cache_hit"] = False tags.append(f"cache_hit:{kwargs['cache_hit']}") trace_params.update({"tags": tags}) From bac5b40248b5af0441d44bda89b166325e05cd27 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 12:28:21 -0800 Subject: [PATCH 087/218] (feat )add semantic cache --- litellm/caching.py | 102 +++++++++++++++++++++++++++++++++- litellm/tests/test_caching.py | 25 +++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index d0721fe9a9..e1ef95dc34 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -83,7 +83,6 @@ class InMemoryCache(BaseCache): self.cache_dict.clear() self.ttl_dict.clear() - async def disconnect(self): pass @@ -217,7 +216,6 @@ class RedisCache(BaseCache): def flush_cache(self): self.redis_client.flushall() - async def disconnect(self): pass @@ -225,6 +223,102 @@ class RedisCache(BaseCache): self.redis_client.delete(key) +class RedisSemanticCache(RedisCache): + def __init__(self, host, port, password, **kwargs): + super().__init__() + + # from redis.commands.search.field import TagField, TextField, NumericField, VectorField + # from redis.commands.search.indexDefinition import 
IndexDefinition, IndexType + # from redis.commands.search.query import Query + + # INDEX_NAME = 'idx:litellm_completion_response_vss' + # DOC_PREFIX = 'bikes:' + + # try: + # # check to see if index exists + # client.ft(INDEX_NAME).info() + # print('Index already exists!') + # except: + # # schema + # schema = ( + # TextField('$.model', no_stem=True, as_name='model'), + # TextField('$.brand', no_stem=True, as_name='brand'), + # NumericField('$.price', as_name='price'), + # TagField('$.type', as_name='type'), + # TextField('$.description', as_name='description'), + # VectorField('$.description_embeddings', + # 'FLAT', { + # 'TYPE': 'FLOAT32', + # 'DIM': VECTOR_DIMENSION, + # 'DISTANCE_METRIC': 'COSINE', + # }, as_name='vector' + # ), + # ) + + # # index Definition + # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) + + # # create Index + # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) + print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") + try: + # get text response + # print("in redis semantic cache: value: ", value) + llm_response = value["response"] + + # if llm_response is a string, convert it to a dictionary + if isinstance(llm_response, str): + llm_response = json.loads(llm_response) + + # print("converted llm_response: ", llm_response) + response = llm_response["choices"][0]["message"]["content"] + + # create embedding response + + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=response, + cache={"no-store": True}, + ) + + raw_embedding = embedding_response["data"][0]["embedding"] + raw_embedding_dimension = len(raw_embedding) + + # print("embedding: ", raw_embedding) + key = "litellm-semantic:" + key + self.redis_client.json().set( + name=key, + path="$", + obj=json.dumps( + { + "response": response, + "embedding": raw_embedding, + "dimension": raw_embedding_dimension, + } + ), + ) + + stored_redis_value = self.redis_client.json().get(name=key) + + # print("Stored Redis Value: ", stored_redis_value) + + except Exception as e: + # print("Error occurred: ", e) + # NON blocking - notify users Redis is throwing an exception + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + + def get_cache(self, key, **kwargs): + pass + + async def async_set_cache(self, key, value, **kwargs): + pass + + async def async_get_cache(self, key, **kwargs): + pass + class S3Cache(BaseCache): def __init__( @@ -429,7 +523,7 @@ class DualCache(BaseCache): class Cache: def __init__( self, - type: Optional[Literal["local", "redis", "s3"]] = "local", + type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local", host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, @@ -468,6 +562,8 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) + elif type == "redis-semantic": + self.cache = RedisSemanticCache(host, port, password, **kwargs) elif type == "local": self.cache = InMemoryCache() elif type == "s3": diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 468ab6f80f..32904ab784 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -987,3 +987,28 @@ def test_cache_context_managers(): # test_cache_context_managers() + + +def test_redis_semantic_cache_completion(): + litellm.set_verbose = False + + random_number = random.randint( + 1, 100000 + ) # add a 
random number to ensure it's always adding / reading from cache + messages = [ + {"role": "user", "content": f"write a one sentence poem about: {random_number}"} + ] + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + ) + print("test2 for Redis Caching - non streaming") + response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) + # response2 = completion( + # model="gpt-3.5-turbo", messages=messages,max_tokens=20 + # ) + + +# test_redis_cache_completion() From 8c49cf0bbecbb31b7ecea4b057e948243c617a2a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:12 -0800 Subject: [PATCH 088/218] (feat) working - sync semantic caching --- litellm/caching.py | 227 ++++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 75 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index e1ef95dc34..0a1046f0d8 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -223,94 +223,161 @@ class RedisCache(BaseCache): self.redis_client.delete(key) -class RedisSemanticCache(RedisCache): - def __init__(self, host, port, password, **kwargs): - super().__init__() +class RedisSemanticCache(BaseCache): + def __init__( + self, + host=None, + port=None, + password=None, + redis_url=None, + similarity_threshold=None, + **kwargs, + ): + from redisvl.index import SearchIndex + from redisvl.query import VectorQuery - # from redis.commands.search.field import TagField, TextField, NumericField, VectorField - # from redis.commands.search.indexDefinition import IndexDefinition, IndexType - # from redis.commands.search.query import Query + print_verbose( + "redis semantic-cache initializing INDEX - litellm_semantic_cache_index" + ) + if similarity_threshold is None: + raise Exception("similarity_threshold must be provided, passed None") + self.similarity_threshold = similarity_threshold + schema = { + "index": { + "name": "litellm_semantic_cache_index", + "prefix": "litellm", + "storage_type": "hash", + }, + "fields": { + "text": [{"name": "response"}], + "text": [{"name": "prompt"}], + "vector": [ + { + "name": "litellm_embedding", + "dims": 1536, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + } + ], + }, + } + self.index = SearchIndex.from_dict(schema) + if redis_url is None: + # if no url passed, check if host, port and password are passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # 
VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + 
num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. **kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. 
""" try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From ff70e0ea66ad2df9ab773752f5ab40b90f490531 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 089/218] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab784..3ac812cf35 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From 2adf88df240c5efe1a8dc05c586a206a07019987 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 090/218] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf35..4b47614cca 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # test_redis_cache_completion() From 2084d0e0403dc6119a1348bdf8a106b64624df58 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 091/218] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d8..877f935fab 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis 
semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From bc450487d07e1a959449c15157fe13950822b8a6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 092/218] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fab..ad37f2077c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def 
async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From f706b42926d1ce8933fbfe46b74a99bee4d88bff Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 093/218] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cca..a1a42ff659 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + 
random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From 7166d63d8700ed203cb43d33b793434c3ad50806 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 094/218] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077c..a7958d074c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or port is None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From e2ccdb7a1b22ba804f0d40968baaa49569084412 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 095/218] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 62315b3d97..b37c68d655 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import 
OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2535,6 +2535,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From e8fbbd0722f0a8f660725e9d97a22975525e4a91 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 096/218] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index bd844bd7ba..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,10 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - upperbound_key_generate_params: - max_budget: 100 - duration: "30d" - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 6b83d0219e720d1be129c64f952f72c7a8262352 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 097/218] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff2..4730fc28b1 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 768e8dff3f..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From a900b0128be71b3f6b71f8366b661096a083ca84 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 098/218] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074c..133d1db6dd 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = 
results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From 1ac003e8fe11983f87e02c791938acb60861fcf6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 099/218] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6dd..6bf53ea451 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache 
get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From d92f2d777f813f00208dbbd6c7b45b6f8e1823e7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 100/218] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b41828..326544f41e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 0e7fe751a556b5d3ac2ec2df28b744a6241b1a48 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 101/218] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed34..3f26878241 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to 
True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. + + ```shell + REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' + ## OR ## + REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' + REDIS_PORT = "" # REDIS_PORT='18841' + REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' + ``` + +**Additional kwargs** +You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +```shell +REDIS_ = "" +``` + +### Step 3: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + From f8472fe3cfd0ead1aa3c302bd37a16f18d353d62 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 102/218] (feat) redis-semantic cache on proxy --- litellm/proxy/proxy_server.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 5c336ea91e..30233fc137 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1168,7 +1168,7 @@ class ProxyConfig: verbose_proxy_logger.debug(f"passed cache type={cache_type}") - if cache_type == "redis": + if cache_type == "redis" or cache_type == "redis-semantic": cache_host = litellm.get_secret("REDIS_HOST", None) cache_port = litellm.get_secret("REDIS_PORT", None) cache_password = litellm.get_secret("REDIS_PASSWORD", None) @@ -1195,6 +1195,9 @@ class ProxyConfig: f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}" ) print() # noqa + if cache_type == "redis-semantic": + # by default this should always be async + cache_params.update({"redis_semantic_cache_use_async": True}) # users can pass os.environ/ variables on the proxy - we should read them from the env for key, value in cache_params.items(): diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From d6a76da74ebdf57450e8538fe5cd87c6ed9afa20 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:39:44 -0800 Subject: [PATCH 103/218] (fix) test-semantic caching --- litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index a1a42ff659..cc18dda165 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion(): ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" - # assert response1.choices[0].message == 1 + random_number = random.randint(1, 100000) + + response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) 
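+    # note: response2 uses a new random number, so its prompt is not an exact match
+    # for response1's prompt; the id assertion below relies on the semantic cache
+    # matching the two prompts by embedding similarity (threshold 0.8 set above).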
+ print(f"response2: {response1}") + assert response1.id == response2.id # test_redis_cache_completion() @@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion(): "content": f"write a one sentence poem about: {random_number}", } ], - max_tokens=20, + max_tokens=5, ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" + random_number = random.randint(1, 100000) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=5, + ) + print(f"response2: {response2}") + assert response1.id == response2.id From e96c494c5b20b36c885ae68f27ba8a38cf128e2c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:53:28 -0800 Subject: [PATCH 104/218] (docs) redis cache --- docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 8a580f087c..7b21d35b6c 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,11 +1,11 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching - In-Memory, Redis, s3 +# Caching - In-Memory, Redis, s3, Redis Semantic Cache [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) -## Initialize Cache - In Memory, Redis, s3 Bucket +## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache @@ -18,7 +18,7 @@ pip install redis ``` For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ -### Quick Start + ```python import litellm from litellm import completion @@ -55,7 +55,7 @@ Set AWS environment variables AWS_ACCESS_KEY_ID = "AKI*******" AWS_SECRET_ACCESS_KEY = "WOl*****" ``` -### Quick Start + ```python import litellm from litellm import completion @@ -80,6 +80,66 @@ response2 = completion( + + +Install redis +```shell +pip install redisvl==0.0.7 +``` + +For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ + +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +random_number = random.randint( + 1, 100000 +) # add a random number to ensure it's always adding / reading from cache + +print("testing semantic caching") +litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here +) +response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response1: {response1}") + +random_number = random.randint(1, 100000) + +response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response2: {response1}") +assert response1.id == response2.id +# response1 == response2, response 1 is cached +``` + + + + + ### Quick Start From 842b0fd9cc72801beb088e0a332a60b7dee77911 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:54:55 -0800 Subject: [PATCH 105/218] (docs) 
litellm semantic caching --- docs/my-website/docs/caching/redis_cache.md | 2 +- docs/my-website/docs/proxy/caching.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 7b21d35b6c..75e1db9557 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -104,7 +104,7 @@ litellm.cache = Cache( host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.8, + similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3f26878241..d5b589e5c2 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -7,6 +7,7 @@ Cache LLM Responses LiteLLM supports: - In Memory Cache - Redis Cache +- Redis Semantic Cache - s3 Bucket Cache ## Quick Start - Redis, s3 Cache, Semantic Cache From 583cae96b2a4e10c256de7fcf3a99f707ce468dc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:55:15 -0800 Subject: [PATCH 106/218] (fix) semantic caching --- litellm/tests/test_caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index cc18dda165..96fd8eb9d2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion(): port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", ) response1 = completion( model="gpt-3.5-turbo", From bca633d3235791cd82570d081c17679508ed5886 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:04:19 -0800 Subject: [PATCH 107/218] (fix) mark semantic caching as beta test --- litellm/tests/test_caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 96fd8eb9d2..6cb5b974a1 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -989,6 +989,7 @@ def test_cache_context_managers(): # test_cache_context_managers() +@pytest.mark.skip(reason="beta test - new redis semantic cache") def test_redis_semantic_cache_completion(): litellm.set_verbose = True import logging @@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() +@pytest.mark.skip(reason="beta test - new redis semantic cache") @pytest.mark.asyncio async def test_redis_semantic_cache_acompletion(): litellm.set_verbose = True From e564cf6869a517481fa9baaa72fd388e30bccc7a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:26:48 -0800 Subject: [PATCH 108/218] (ci/cd) run again --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 6cb5b974a1..8433941e90 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -998,7 +998,7 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 - ) # add a random number to ensure it's always adding / reading 
from cache + ) # add a random number to ensure it's always adding /reading from cache print("testing semantic caching") litellm.cache = Cache( From 672ba8fb12fe9f48cb0f7015e9d071dddff91e91 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:29:31 -0800 Subject: [PATCH 109/218] test(test_completion.py): fix test --- docs/my-website/docs/proxy/caching.md | 7 ++++--- litellm/tests/test_completion.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d5b589e5c2..2b385de8e5 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -211,9 +211,10 @@ litellm_settings: The proxy support 3 cache-controls: -- `ttl`: Will cache the response for the user-defined amount of time (in seconds). -- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). -- `no-cache`: Will not return a cached response, but instead call the actual endpoint. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* Will not cache the response. [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index de79c97afa..b075e48190 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -565,7 +565,6 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None - raise Exception("it works!") except Timeout as e: pass except Exception as e: From 9851c3a370aef5c265dd3ff0e118673ce5002d4b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:09:48 -0800 Subject: [PATCH 110/218] (feat) show langfuse logging tags better through proxy --- litellm/integrations/langfuse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 82de333660..3c3e793dfb 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -252,8 +252,14 @@ class LangFuseLogger: print_verbose(f"trace: {cost}") if supports_tags: for key, value in metadata.items(): - tags.append(f"{key}:{value}") + if key in [ + "user_api_key", + "user_api_key_user_id", + ]: + tags.append(f"{key}:{value}") if "cache_hit" in kwargs: + if kwargs["cache_hit"] is None: + kwargs["cache_hit"] = False tags.append(f"cache_hit:{kwargs['cache_hit']}") trace_params.update({"tags": tags}) From 89c773c2f582f4c01b638d61b5b2787ce835c4d4 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 12:28:21 -0800 Subject: [PATCH 111/218] (feat )add semantic cache --- litellm/caching.py | 102 +++++++++++++++++++++++++++++++++- litellm/tests/test_caching.py | 25 +++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index d0721fe9a9..e1ef95dc34 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -83,7 +83,6 @@ class InMemoryCache(BaseCache): self.cache_dict.clear() self.ttl_dict.clear() - async def disconnect(self): pass @@ -217,7 +216,6 @@ class RedisCache(BaseCache): def flush_cache(self): self.redis_client.flushall() - async def disconnect(self): pass @@ -225,6 
+223,102 @@ class RedisCache(BaseCache): self.redis_client.delete(key) +class RedisSemanticCache(RedisCache): + def __init__(self, host, port, password, **kwargs): + super().__init__() + + # from redis.commands.search.field import TagField, TextField, NumericField, VectorField + # from redis.commands.search.indexDefinition import IndexDefinition, IndexType + # from redis.commands.search.query import Query + + # INDEX_NAME = 'idx:litellm_completion_response_vss' + # DOC_PREFIX = 'bikes:' + + # try: + # # check to see if index exists + # client.ft(INDEX_NAME).info() + # print('Index already exists!') + # except: + # # schema + # schema = ( + # TextField('$.model', no_stem=True, as_name='model'), + # TextField('$.brand', no_stem=True, as_name='brand'), + # NumericField('$.price', as_name='price'), + # TagField('$.type', as_name='type'), + # TextField('$.description', as_name='description'), + # VectorField('$.description_embeddings', + # 'FLAT', { + # 'TYPE': 'FLOAT32', + # 'DIM': VECTOR_DIMENSION, + # 'DISTANCE_METRIC': 'COSINE', + # }, as_name='vector' + # ), + # ) + + # # index Definition + # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) + + # # create Index + # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) + print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") + try: + # get text response + # print("in redis semantic cache: value: ", value) + llm_response = value["response"] + + # if llm_response is a string, convert it to a dictionary + if isinstance(llm_response, str): + llm_response = json.loads(llm_response) + + # print("converted llm_response: ", llm_response) + response = llm_response["choices"][0]["message"]["content"] + + # create embedding response + + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=response, + cache={"no-store": True}, + ) + + raw_embedding = embedding_response["data"][0]["embedding"] + raw_embedding_dimension = len(raw_embedding) + + # print("embedding: ", raw_embedding) + key = "litellm-semantic:" + key + self.redis_client.json().set( + name=key, + path="$", + obj=json.dumps( + { + "response": response, + "embedding": raw_embedding, + "dimension": raw_embedding_dimension, + } + ), + ) + + stored_redis_value = self.redis_client.json().get(name=key) + + # print("Stored Redis Value: ", stored_redis_value) + + except Exception as e: + # print("Error occurred: ", e) + # NON blocking - notify users Redis is throwing an exception + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + + def get_cache(self, key, **kwargs): + pass + + async def async_set_cache(self, key, value, **kwargs): + pass + + async def async_get_cache(self, key, **kwargs): + pass + class S3Cache(BaseCache): def __init__( @@ -429,7 +523,7 @@ class DualCache(BaseCache): class Cache: def __init__( self, - type: Optional[Literal["local", "redis", "s3"]] = "local", + type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local", host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, @@ -468,6 +562,8 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) + elif type == "redis-semantic": + self.cache = RedisSemanticCache(host, port, password, **kwargs) elif type == "local": self.cache = InMemoryCache() elif type == "s3": diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py 
index 468ab6f80f..32904ab784 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -987,3 +987,28 @@ def test_cache_context_managers(): # test_cache_context_managers() + + +def test_redis_semantic_cache_completion(): + litellm.set_verbose = False + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + messages = [ + {"role": "user", "content": f"write a one sentence poem about: {random_number}"} + ] + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + ) + print("test2 for Redis Caching - non streaming") + response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) + # response2 = completion( + # model="gpt-3.5-turbo", messages=messages,max_tokens=20 + # ) + + +# test_redis_cache_completion() From 2d5bb44115104487477b88d0de6eebe749564ea2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:12 -0800 Subject: [PATCH 112/218] (feat) working - sync semantic caching --- litellm/caching.py | 227 ++++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 75 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index e1ef95dc34..0a1046f0d8 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -223,94 +223,161 @@ class RedisCache(BaseCache): self.redis_client.delete(key) -class RedisSemanticCache(RedisCache): - def __init__(self, host, port, password, **kwargs): - super().__init__() +class RedisSemanticCache(BaseCache): + def __init__( + self, + host=None, + port=None, + password=None, + redis_url=None, + similarity_threshold=None, + **kwargs, + ): + from redisvl.index import SearchIndex + from redisvl.query import VectorQuery - # from redis.commands.search.field import TagField, TextField, NumericField, VectorField - # from redis.commands.search.indexDefinition import IndexDefinition, IndexType - # from redis.commands.search.query import Query + print_verbose( + "redis semantic-cache initializing INDEX - litellm_semantic_cache_index" + ) + if similarity_threshold is None: + raise Exception("similarity_threshold must be provided, passed None") + self.similarity_threshold = similarity_threshold + schema = { + "index": { + "name": "litellm_semantic_cache_index", + "prefix": "litellm", + "storage_type": "hash", + }, + "fields": { + "text": [{"name": "response"}], + "text": [{"name": "prompt"}], + "vector": [ + { + "name": "litellm_embedding", + "dims": 1536, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + } + ], + }, + } + self.index = SearchIndex.from_dict(schema) + if redis_url is None: + # if no url passed, check if host, port and password are passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # 
print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + 
model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. **kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. 
""" try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From 720efdcf5a9bd921d671f8860c59b4ef304ed367 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 113/218] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab784..3ac812cf35 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From 8a2dfcc9d4dec61b081ef33b5d934a2106192fe9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 114/218] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf35..4b47614cca 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # test_redis_cache_completion() From 1788522101f6fbcb7a39d3c3c3cdc1a4abc6d6c8 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 115/218] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d8..877f935fab 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis 
semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From aaa14e266d68736fef48f5a8cc74c6a195f48182 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 116/218] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fab..ad37f2077c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def 
async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From 10a3e8fffd0958570eaef22c658891b5ed076b69 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 117/218] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cca..a1a42ff659 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + 
random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From 9523bf97370c43bd5ccb6293ff3d090bf5ebb26c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 118/218] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077c..a7958d074c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or port is None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From 915075b90eeb9c51e5bd690fabb9469be231a713 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 119/218] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index fdca57e51f..c25572c03c 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import 
OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2533,6 +2533,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From cbe29711e2cf82b7c662f0d67ce42b70be781711 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 120/218] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index bd844bd7ba..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,10 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - upperbound_key_generate_params: - max_budget: 100 - duration: "30d" - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 8969f54632824a6d70d62491cb20de54285cfaeb Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 121/218] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff2..4730fc28b1 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 768e8dff3f..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 1fcee3bdde7f3eb17dfd3e394aea37bf533949dd Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 122/218] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074c..133d1db6dd 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = 
results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From 0a30c19db010c0af3e69137b486985c551773aef Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 123/218] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6dd..6bf53ea451 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache 
get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From c29320a1392eef3fe3b4aca606637320cf9fcf05 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 124/218] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b41828..326544f41e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 8a552a1cb6680c1d15574d0aafeb9827d1ec85a6 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 125/218] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed34..3f26878241 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to 
True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. + + ```shell + REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' + ## OR ## + REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' + REDIS_PORT = "" # REDIS_PORT='18841' + REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' + ``` + +**Additional kwargs** +You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +```shell +REDIS_ = "" +``` + +### Step 3: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + From bb4889f04dda97b40dc43b809c95e95bb79a068f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 126/218] (feat) redis-semantic cache on proxy --- litellm/proxy/proxy_server.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index c2d3d194ae..6f442f1ae3 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1178,7 +1178,7 @@ class ProxyConfig: verbose_proxy_logger.debug(f"passed cache type={cache_type}") - if cache_type == "redis": + if cache_type == "redis" or cache_type == "redis-semantic": cache_host = litellm.get_secret("REDIS_HOST", None) cache_port = litellm.get_secret("REDIS_PORT", None) cache_password = litellm.get_secret("REDIS_PASSWORD", None) @@ -1205,6 +1205,9 @@ class ProxyConfig: f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}" ) print() # noqa + if cache_type == "redis-semantic": + # by default this should always be async + cache_params.update({"redis_semantic_cache_use_async": True}) # users can pass os.environ/ variables on the proxy - we should read them from the env for key, value in cache_params.items(): diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 0ea74446c1d069e2998cc150334eb7fb9f98c46c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:39:44 -0800 Subject: [PATCH 127/218] (fix) test-semantic caching --- litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index a1a42ff659..cc18dda165 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion(): ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" - # assert response1.choices[0].message == 1 + random_number = random.randint(1, 100000) + + response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) 
+ print(f"response2: {response1}") + assert response1.id == response2.id # test_redis_cache_completion() @@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion(): "content": f"write a one sentence poem about: {random_number}", } ], - max_tokens=20, + max_tokens=5, ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" + random_number = random.randint(1, 100000) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=5, + ) + print(f"response2: {response2}") + assert response1.id == response2.id From c5d81c5c6d815a4ee571ba363626d341904a5347 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:53:28 -0800 Subject: [PATCH 128/218] (docs) redis cache --- docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 8a580f087c..7b21d35b6c 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,11 +1,11 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching - In-Memory, Redis, s3 +# Caching - In-Memory, Redis, s3, Redis Semantic Cache [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) -## Initialize Cache - In Memory, Redis, s3 Bucket +## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache @@ -18,7 +18,7 @@ pip install redis ``` For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ -### Quick Start + ```python import litellm from litellm import completion @@ -55,7 +55,7 @@ Set AWS environment variables AWS_ACCESS_KEY_ID = "AKI*******" AWS_SECRET_ACCESS_KEY = "WOl*****" ``` -### Quick Start + ```python import litellm from litellm import completion @@ -80,6 +80,66 @@ response2 = completion( + + +Install redis +```shell +pip install redisvl==0.0.7 +``` + +For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ + +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +random_number = random.randint( + 1, 100000 +) # add a random number to ensure it's always adding / reading from cache + +print("testing semantic caching") +litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here +) +response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response1: {response1}") + +random_number = random.randint(1, 100000) + +response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response2: {response1}") +assert response1.id == response2.id +# response1 == response2, response 1 is cached +``` + + + + + ### Quick Start From daf388b04f9a410d645a8c53c3d4dc4a2c6364cc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:54:55 -0800 Subject: [PATCH 129/218] (docs) 
litellm semantic caching --- docs/my-website/docs/caching/redis_cache.md | 2 +- docs/my-website/docs/proxy/caching.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 7b21d35b6c..75e1db9557 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -104,7 +104,7 @@ litellm.cache = Cache( host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.8, + similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3f26878241..d5b589e5c2 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -7,6 +7,7 @@ Cache LLM Responses LiteLLM supports: - In Memory Cache - Redis Cache +- Redis Semantic Cache - s3 Bucket Cache ## Quick Start - Redis, s3 Cache, Semantic Cache From f30de227a1b707942ffbec2ff6b4d74be3095f51 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:55:15 -0800 Subject: [PATCH 130/218] (fix) semantic caching --- litellm/tests/test_caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index cc18dda165..96fd8eb9d2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion(): port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", ) response1 = completion( model="gpt-3.5-turbo", From b180106c0e59bdb769bb70643cdc49091aad64f5 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:04:19 -0800 Subject: [PATCH 131/218] (fix) mark semantic caching as beta test --- litellm/tests/test_caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 96fd8eb9d2..6cb5b974a1 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -989,6 +989,7 @@ def test_cache_context_managers(): # test_cache_context_managers() +@pytest.mark.skip(reason="beta test - new redis semantic cache") def test_redis_semantic_cache_completion(): litellm.set_verbose = True import logging @@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() +@pytest.mark.skip(reason="beta test - new redis semantic cache") @pytest.mark.asyncio async def test_redis_semantic_cache_acompletion(): litellm.set_verbose = True From 731b59878e968fbce02f534cdf26f5306a38979e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:26:48 -0800 Subject: [PATCH 132/218] (ci/cd) run again --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 6cb5b974a1..8433941e90 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -998,7 +998,7 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 - ) # add a random number to ensure it's always adding / reading 
from cache + ) # add a random number to ensure it's always adding /reading from cache print("testing semantic caching") litellm.cache = Cache( From fdab7b2bc765e30bb1c8a2ba0e5f7cb9427ad206 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:29:31 -0800 Subject: [PATCH 133/218] test(test_completion.py): fix test --- docs/my-website/docs/proxy/caching.md | 7 ++++--- litellm/tests/test_completion.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d5b589e5c2..2b385de8e5 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -211,9 +211,10 @@ litellm_settings: The proxy support 3 cache-controls: -- `ttl`: Will cache the response for the user-defined amount of time (in seconds). -- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). -- `no-cache`: Will not return a cached response, but instead call the actual endpoint. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* Will not cache the response. [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index de79c97afa..b075e48190 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -565,7 +565,6 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None - raise Exception("it works!") except Timeout as e: pass except Exception as e: From 71da1b1e6964e7a3fda14fd7d72a3dfd500b7bde Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 134/218] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 326544f41e..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_embedding_model: azure-embedding-model + redis_semantic_cache_use_async: True # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 6b631a6c3a18acb5969338a7a5f02478d76af96c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 135/218] (fix) add redisvl==0.0.7 --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872ad..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching -numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From a3fac3db12832c7da82dc4d9e7eb7c0361902dff Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 136/218] (feat) redis-semantic cache on proxy --- 
requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 9bf57170e72e252508bc2388d27dc7a881cfa31c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:35:25 -0800 Subject: [PATCH 137/218] (feat) show semantic-cache on health/readiness --- litellm/caching.py | 3 +++ litellm/proxy/proxy_server.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/litellm/caching.py b/litellm/caching.py index 6bf53ea451..f996a58735 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -531,6 +531,9 @@ class RedisSemanticCache(BaseCache): return None pass + async def _index_info(self): + return await self.index.ainfo() + class S3Cache(BaseCache): def __init__( diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 046bc71b05..7ac1521ba5 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4051,8 +4051,18 @@ async def health_readiness(): cache_type = None if litellm.cache is not None: + from litellm.caching import RedisSemanticCache + cache_type = litellm.cache.type + if isinstance(litellm.cache.cache, RedisSemanticCache): + # ping the cache + try: + index_info = await litellm.cache.cache._index_info() + except Exception as e: + index_info = "index does not exist - error: " + str(e) + cache_type = {"type": cache_type, "index_info": index_info} + if prisma_client is not None: # if db passed in, check if it's connected await prisma_client.health_check() # test the db connection response_object = {"db": "connected"} From 17185976c42ed1ee2c7b2a100c291b2d6e2b5ed7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:36:35 -0800 Subject: [PATCH 138/218] (fix) dockerfile requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872ad..f2bff2680b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ pyyaml>=6.0.1 # server dep uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls -redis==4.6.0 # caching +redis==5.0.0 # caching redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db From 737111e620b6c0d6177e6b21d1f31a2138b8264a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:41:22 -0800 Subject: [PATCH 139/218] (fix) langfuse show semantic-similarity in tags --- litellm/integrations/langfuse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 3c3e793dfb..3031868ec7 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -255,6 +255,7 @@ class LangFuseLogger: if key in [ "user_api_key", "user_api_key_user_id", + "semantic-similarity", ]: tags.append(f"{key}:{value}") if "cache_hit" in kwargs: From 22a65638695dccb5727bf70965f374e26c7e434a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:52:32 -0800 Subject: [PATCH 140/218] (fix) redisvl requirements.txt issue --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt 
index f2bff2680b..55c5f14568 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,6 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==5.0.0 # caching -redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions From 2888b11a06b6022ad59a151bb79834f44bc0b6d9 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:55:51 -0800 Subject: [PATCH 141/218] refactor(main.py): trigger deploy n --- litellm/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/litellm/main.py b/litellm/main.py index 384dadc32d..b18221607f 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -10,7 +10,6 @@ import os, openai, sys, json, inspect, uuid, datetime, threading from typing import Any, Literal, Union from functools import partial - import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx From 7e46156da2cdeda82179c39cb494b7a9e635501d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 142/218] (fix) add redisvl==0.0.7 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 55c5f14568..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,8 @@ pyyaml>=6.0.1 # server dep uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls -redis==5.0.0 # caching -numpy==1.24.3 # semantic caching +redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 540a6068d6f93f9d7e8181231f5f0428d62142b2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 143/218] (feat) redis-semantic cache on proxy --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From ad4b6be3ee9731642f02fceefb4b4c1d9e834d74 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 14:00:27 -0800 Subject: [PATCH 144/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index b075e48190..80a4372a57 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 9ebcf3c94496435fce2f4154f3c70616a47b35d3 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 14:10:56 -0800 Subject: [PATCH 145/218] build(requirements.txt): fix dependency --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872ad..f78d766ee3 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -9,7 +9,6 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching -redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions From 589d2ed96547bd9ca9c1bb997be54a181259066f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 14:36:24 -0800 Subject: [PATCH 146/218] test(test_key_generate_dynamodb.py): fix test --- litellm/tests/test_key_generate_dynamodb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_key_generate_dynamodb.py b/litellm/tests/test_key_generate_dynamodb.py index 61d0ff6a66..e77dc74723 100644 --- a/litellm/tests/test_key_generate_dynamodb.py +++ b/litellm/tests/test_key_generate_dynamodb.py @@ -490,8 +490,13 @@ def test_dynamo_db_migration(custom_db_client): try: async def test(): + request = GenerateKeyRequest(max_budget=1) + key = await generate_key_fn(request) + print(key) + + generated_key = key.key bearer_token = ( - "Bearer " + "sk-elJDL2pOEjcAuC7zD4psAg" + "Bearer " + generated_key ) # this works with ishaan's db, it's a never expiring key request = Request(scope={"type": "http"}) From ef20536aa06013a4bec176a4210d3989e161760d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 15:13:59 -0800 Subject: [PATCH 147/218] (Feat) support max_user_budget --- litellm/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/__init__.py b/litellm/__init__.py index 26b761c64a..6a0cb95ae6 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -148,6 +148,7 @@ s3_callback_params: Optional[Dict] = None default_key_generate_params: Optional[Dict] = None upperbound_key_generate_params: Optional[Dict] = None default_team_settings: Optional[List] = None +max_user_budget: Optional[float] = None #### RELIABILITY #### request_timeout: Optional[float] = 6000 num_retries: Optional[int] = None # per model endpoint From 33aee6ba836ed29701951b5b3760419ea618d46d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 15:16:20 -0800 Subject: [PATCH 148/218] (feat) max_user_budget --- litellm/proxy/proxy_config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index a8144e9d48..7d774d9105 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -68,6 +68,7 @@ litellm_settings: fallbacks: [{"openai-gpt-3.5": ["azure-gpt-3.5"]}] success_callback: ['langfuse'] max_budget: 10 # global budget for proxy + max_user_budget: 0.0001 budget_duration: 30d # global budget duration, will reset after 30d default_key_generate_params: max_budget: 1.5000 From 0440e50a16813ac0e6c1b0f34f9b99a3fe9c1b70 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Mon, 5 Feb 2024 17:07:57 -0800 Subject: [PATCH 149/218] Update model_prices_and_context_window.json --- model_prices_and_context_window.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index b6ded001c9..4c28bdbe8b 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -156,8 +156,8 @@ "max_tokens": 4097, "max_input_tokens": 4097, "max_output_tokens": 4096, - "input_cost_per_token": 0.000012, - "output_cost_per_token": 0.000016, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000006, "litellm_provider": "openai", "mode": "chat" }, From 
77516b05de0c5b16676b25360ee102667969c85a Mon Sep 17 00:00:00 2001 From: John HU Date: Mon, 5 Feb 2024 17:30:39 -0800 Subject: [PATCH 150/218] Fix admin UI title and description --- ui/litellm-dashboard/src/app/layout.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/litellm-dashboard/src/app/layout.tsx b/ui/litellm-dashboard/src/app/layout.tsx index 3314e4780a..a04a0d66ed 100644 --- a/ui/litellm-dashboard/src/app/layout.tsx +++ b/ui/litellm-dashboard/src/app/layout.tsx @@ -5,8 +5,8 @@ import "./globals.css"; const inter = Inter({ subsets: ["latin"] }); export const metadata: Metadata = { - title: "Create Next App", - description: "Generated by create next app", + title: "🚅 LiteLLM", + description: "LiteLLM Proxy Admin UI", }; export default function RootLayout({ From dd2f2b0309a0163e53fc951a5823a128ad3fc216 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 5 Feb 2024 16:16:15 -0800 Subject: [PATCH 151/218] fix(langfuse.py): support logging failed llm api calls to langfuse --- litellm/integrations/langfuse.py | 198 +++++++++++++++++++------------ litellm/utils.py | 58 ++++----- 2 files changed, 151 insertions(+), 105 deletions(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index e62dccdc47..82de333660 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -55,8 +55,21 @@ class LangFuseLogger: else: self.upstream_langfuse = None + # def log_error(kwargs, response_obj, start_time, end_time): + # generation = trace.generation( + # level ="ERROR" # can be any of DEBUG, DEFAULT, WARNING or ERROR + # status_message='error' # can be any string (e.g. stringified stack trace or error body) + # ) def log_event( - self, kwargs, response_obj, start_time, end_time, user_id, print_verbose + self, + kwargs, + response_obj, + start_time, + end_time, + user_id, + print_verbose, + level="DEFAULT", + status_message=None, ): # Method definition @@ -84,37 +97,49 @@ class LangFuseLogger: pass # end of processing langfuse ######################## - if kwargs.get("call_type", None) == "embedding" or isinstance( - response_obj, litellm.EmbeddingResponse + if ( + level == "ERROR" + and status_message is not None + and isinstance(status_message, str) + ): + input = prompt + output = status_message + elif response_obj is not None and ( + kwargs.get("call_type", None) == "embedding" + or isinstance(response_obj, litellm.EmbeddingResponse) ): input = prompt output = response_obj["data"] - else: + elif response_obj is not None: input = prompt output = response_obj["choices"][0]["message"].json() - print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}") - self._log_langfuse_v2( - user_id, - metadata, - output, - start_time, - end_time, - kwargs, - optional_params, - input, - response_obj, - print_verbose, - ) if self._is_langfuse_v2() else self._log_langfuse_v1( - user_id, - metadata, - output, - start_time, - end_time, - kwargs, - optional_params, - input, - response_obj, - ) + print(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}") + if self._is_langfuse_v2(): + self._log_langfuse_v2( + user_id, + metadata, + output, + start_time, + end_time, + kwargs, + optional_params, + input, + response_obj, + level, + print_verbose, + ) + elif response_obj is not None: + self._log_langfuse_v1( + user_id, + metadata, + output, + start_time, + end_time, + kwargs, + optional_params, + input, + response_obj, + ) self.Langfuse.flush() print_verbose( @@ -123,15 +148,15 @@ class LangFuseLogger: 
verbose_logger.info(f"Langfuse Layer Logging - logging success") except: traceback.print_exc() - print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}") + print(f"Langfuse Layer Error - {traceback.format_exc()}") pass async def _async_log_event( self, kwargs, response_obj, start_time, end_time, user_id, print_verbose ): - self.log_event( - kwargs, response_obj, start_time, end_time, user_id, print_verbose - ) + """ + TODO: support async calls when langfuse is truly async + """ def _is_langfuse_v2(self): import langfuse @@ -193,57 +218,78 @@ class LangFuseLogger: optional_params, input, response_obj, + level, print_verbose, ): import langfuse - tags = [] - supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") - supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3") + try: + tags = [] + supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3") + supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3") - print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ") + print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ") - generation_name = metadata.get("generation_name", None) - if generation_name is None: - # just log `litellm-{call_type}` as the generation name - generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" + generation_name = metadata.get("generation_name", None) + if generation_name is None: + # just log `litellm-{call_type}` as the generation name + generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" - trace_params = { - "name": generation_name, - "input": input, - "output": output, - "user_id": metadata.get("trace_user_id", user_id), - "id": metadata.get("trace_id", None), - "session_id": metadata.get("session_id", None), - } - cost = kwargs["response_cost"] - print_verbose(f"trace: {cost}") - if supports_tags: - for key, value in metadata.items(): - tags.append(f"{key}:{value}") - if "cache_hit" in kwargs: - tags.append(f"cache_hit:{kwargs['cache_hit']}") - trace_params.update({"tags": tags}) + trace_params = { + "name": generation_name, + "input": input, + "user_id": metadata.get("trace_user_id", user_id), + "id": metadata.get("trace_id", None), + "session_id": metadata.get("session_id", None), + } - trace = self.Langfuse.trace(**trace_params) + if level == "ERROR": + trace_params["status_message"] = output + else: + trace_params["output"] = output - # get generation_id - generation_id = None - if response_obj.get("id", None) is not None: - generation_id = litellm.utils.get_logging_id(start_time, response_obj) - trace.generation( - name=generation_name, - id=metadata.get("generation_id", generation_id), - startTime=start_time, - endTime=end_time, - model=kwargs["model"], - modelParameters=optional_params, - input=input, - output=output, - usage={ - "prompt_tokens": response_obj["usage"]["prompt_tokens"], - "completion_tokens": response_obj["usage"]["completion_tokens"], - "total_cost": cost if supports_costs else None, - }, - metadata=metadata, - ) + cost = kwargs.get("response_cost", None) + print_verbose(f"trace: {cost}") + if supports_tags: + for key, value in metadata.items(): + tags.append(f"{key}:{value}") + if "cache_hit" in kwargs: + tags.append(f"cache_hit:{kwargs['cache_hit']}") + trace_params.update({"tags": tags}) + + trace = self.Langfuse.trace(**trace_params) + + if level == "ERROR": + trace.generation( + level="ERROR", # can be any of DEBUG, DEFAULT, WARNING or ERROR + status_message=output, # can be any string (e.g. 
stringified stack trace or error body) + ) + print(f"SUCCESSFULLY LOGGED ERROR") + else: + # get generation_id + generation_id = None + if ( + response_obj is not None + and response_obj.get("id", None) is not None + ): + generation_id = litellm.utils.get_logging_id( + start_time, response_obj + ) + trace.generation( + name=generation_name, + id=metadata.get("generation_id", generation_id), + startTime=start_time, + endTime=end_time, + model=kwargs["model"], + modelParameters=optional_params, + input=input, + output=output, + usage={ + "prompt_tokens": response_obj["usage"]["prompt_tokens"], + "completion_tokens": response_obj["usage"]["completion_tokens"], + "total_cost": cost if supports_costs else None, + }, + metadata=metadata, + ) + except Exception as e: + print(f"Langfuse Layer Error - {traceback.format_exc()}") diff --git a/litellm/utils.py b/litellm/utils.py index e56ba879f8..1e83a319f4 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1636,34 +1636,6 @@ class Logging: end_time=end_time, print_verbose=print_verbose, ) - if callback == "langfuse": - global langFuseLogger - print_verbose("reaches Async langfuse for logging!") - kwargs = {} - for k, v in self.model_call_details.items(): - if ( - k != "original_response" - ): # copy.deepcopy raises errors as this could be a coroutine - kwargs[k] = v - # this only logs streaming once, complete_streaming_response exists i.e when stream ends - if self.stream: - if "complete_streaming_response" not in kwargs: - return - else: - print_verbose( - "reaches Async langfuse for streaming logging!" - ) - result = kwargs["complete_streaming_response"] - if langFuseLogger is None: - langFuseLogger = LangFuseLogger() - await langFuseLogger._async_log_event( - kwargs=kwargs, - response_obj=result, - start_time=start_time, - end_time=end_time, - user_id=kwargs.get("user", None), - print_verbose=print_verbose, - ) except: print_verbose( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}" @@ -1788,9 +1760,37 @@ class Logging: response_obj=result, kwargs=self.model_call_details, ) + elif callback == "langfuse": + global langFuseLogger + verbose_logger.debug("reaches langfuse for logging!") + kwargs = {} + for k, v in self.model_call_details.items(): + if ( + k != "original_response" + ): # copy.deepcopy raises errors as this could be a coroutine + kwargs[k] = v + # this only logs streaming once, complete_streaming_response exists i.e when stream ends + if langFuseLogger is None or ( + self.langfuse_public_key != langFuseLogger.public_key + and self.langfuse_secret != langFuseLogger.secret_key + ): + langFuseLogger = LangFuseLogger( + langfuse_public_key=self.langfuse_public_key, + langfuse_secret=self.langfuse_secret, + ) + langFuseLogger.log_event( + start_time=start_time, + end_time=end_time, + response_obj=None, + user_id=kwargs.get("user", None), + print_verbose=print_verbose, + status_message=str(exception), + level="ERROR", + kwargs=self.model_call_details, + ) except Exception as e: print_verbose( - f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {traceback.format_exc()}" + f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {str(e)}" ) print_verbose( f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}" From 8f5f79f300d8626dcb7afe9f2a08dbc62d255278 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:37:05 -0800 Subject: [PATCH 152/218] (docs) 
upperbound_key_generate_params --- docs/my-website/docs/proxy/virtual_keys.md | 16 ++++++++++++++++ .../model_prices_and_context_window_backup.json | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md index dd5edc6da8..c51bfc0ac9 100644 --- a/docs/my-website/docs/proxy/virtual_keys.md +++ b/docs/my-website/docs/proxy/virtual_keys.md @@ -352,6 +352,22 @@ Request Params: } ``` +## Upperbound /key/generate params +Use this, if you need to control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key. + +Set `litellm_settings:upperbound_key_generate_params`: +```yaml +litellm_settings: + upperbound_key_generate_params: + max_budget: 100 # upperbound of $100, for all /key/generate requests + duration: "30d" # upperbound of 30 days for all /key/generate requests +``` + +** Expected Behavior ** + +- Send a `/key/generate` request with `max_budget=200` +- Key will be created with `max_budget=100` since 100 is the upper bound + ## Default /key/generate params Use this, if you need to control the default `max_budget` or any `key/generate` param per key. diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index b6ded001c9..4c28bdbe8b 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -156,8 +156,8 @@ "max_tokens": 4097, "max_input_tokens": 4097, "max_output_tokens": 4096, - "input_cost_per_token": 0.000012, - "output_cost_per_token": 0.000016, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000006, "litellm_provider": "openai", "mode": "chat" }, From 865282f8d44f2197673dd15b3772af1a406db484 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:38:47 -0800 Subject: [PATCH 153/218] (feat) upperbound_key_generate_params --- litellm/__init__.py | 1 + litellm/proxy/proxy_server.py | 69 +++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 3f2a1e4b4d..26b761c64a 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -146,6 +146,7 @@ suppress_debug_info = False dynamodb_table_name: Optional[str] = None s3_callback_params: Optional[Dict] = None default_key_generate_params: Optional[Dict] = None +upperbound_key_generate_params: Optional[Dict] = None default_team_settings: Optional[List] = None #### RELIABILITY #### request_timeout: Optional[float] = 6000 diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 289a36cb2b..494c874147 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1391,6 +1391,26 @@ class ProxyConfig: proxy_config = ProxyConfig() +def _duration_in_seconds(duration: str): + match = re.match(r"(\d+)([smhd]?)", duration) + if not match: + raise ValueError("Invalid duration format") + + value, unit = match.groups() + value = int(value) + + if unit == "s": + return value + elif unit == "m": + return value * 60 + elif unit == "h": + return value * 3600 + elif unit == "d": + return value * 86400 + else: + raise ValueError("Unsupported duration unit") + + async def generate_key_helper_fn( duration: Optional[str], models: list, @@ -1425,25 +1445,6 @@ async def generate_key_helper_fn( if token is None: token = f"sk-{secrets.token_urlsafe(16)}" - def _duration_in_seconds(duration: str): - match = 
re.match(r"(\d+)([smhd]?)", duration) - if not match: - raise ValueError("Invalid duration format") - - value, unit = match.groups() - value = int(value) - - if unit == "s": - return value - elif unit == "m": - return value * 60 - elif unit == "h": - return value * 3600 - elif unit == "d": - return value * 86400 - else: - raise ValueError("Unsupported duration unit") - if duration is None: # allow tokens that never expire expires = None else: @@ -2660,6 +2661,36 @@ async def generate_key_fn( elif key == "metadata" and value == {}: setattr(data, key, litellm.default_key_generate_params.get(key, {})) + # check if user set default key/generate params on config.yaml + if litellm.upperbound_key_generate_params is not None: + for elem in data: + # if key in litellm.upperbound_key_generate_params, use the min of value and litellm.upperbound_key_generate_params[key] + key, value = elem + if value is not None and key in litellm.upperbound_key_generate_params: + # if value is float/int + if key in [ + "max_budget", + "max_parallel_requests", + "tpm_limit", + "rpm_limit", + ]: + if value > litellm.upperbound_key_generate_params[key]: + # directly compare floats/ints + setattr( + data, key, litellm.upperbound_key_generate_params[key] + ) + elif key == "budget_duration": + # budgets are in 1s, 1m, 1h, 1d, 1m (30s, 30m, 30h, 30d, 30m) + # compare the duration in seconds and max duration in seconds + upperbound_budget_duration = _duration_in_seconds( + duration=litellm.upperbound_key_generate_params[key] + ) + user_set_budget_duration = _duration_in_seconds(duration=value) + if user_set_budget_duration > upperbound_budget_duration: + setattr( + data, key, litellm.upperbound_key_generate_params[key] + ) + data_json = data.json() # type: ignore # if we get max_budget passed to /key/generate, then use it as key_max_budget. 
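A minimal standalone sketch of the clamping behaviour introduced here, assuming the `upperbound_key_generate_params` shape shown in the docs above (the concrete values are illustrative, and the duration helper is reproduced only to make the example runnable):

```python
import re

def _duration_in_seconds(duration: str) -> int:
    # same parsing as the helper added in this patch: "30s", "30m", "30h", "30d"
    match = re.match(r"(\d+)([smhd]?)", duration)
    if not match:
        raise ValueError("Invalid duration format")
    value, unit = match.groups()
    seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    if unit not in seconds_per_unit:
        raise ValueError("Unsupported duration unit")
    return int(value) * seconds_per_unit[unit]

# illustrative upperbounds, as they would appear under
# litellm_settings.upperbound_key_generate_params in the proxy config
upperbound = {"max_budget": 100.0, "budget_duration": "30d"}
requested = {"max_budget": 200.0, "budget_duration": "60d"}

clamped = dict(requested)
# numeric params (max_budget, tpm_limit, ...) are compared and clamped directly
if requested["max_budget"] > upperbound["max_budget"]:
    clamped["max_budget"] = upperbound["max_budget"]
# durations are normalised to seconds before comparing
if _duration_in_seconds(requested["budget_duration"]) > _duration_in_seconds(
    upperbound["budget_duration"]
):
    clamped["budget_duration"] = upperbound["budget_duration"]

print(clamped)  # {'max_budget': 100.0, 'budget_duration': '30d'}
```

Note that the proxy clamps silently rather than rejecting the request: a caller asking for more than the upperbound still receives a key, just with the capped values.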
Since generate_key_helper_fn is used to make new users From 3e8e29a125677b5a44db6eb6774eed67af295a5d Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:39:36 -0800 Subject: [PATCH 154/218] (test) test_upperbound_key_params --- litellm/tests/test_key_generate_prisma.py | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py index de26168591..b4c86afb25 100644 --- a/litellm/tests/test_key_generate_prisma.py +++ b/litellm/tests/test_key_generate_prisma.py @@ -1279,6 +1279,40 @@ async def test_default_key_params(prisma_client): pytest.fail(f"Got exception {e}") +@pytest.mark.asyncio() +async def test_upperbound_key_params(prisma_client): + """ + - create key + - get key info + - assert key_name is not null + """ + setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) + setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") + litellm.upperbound_key_generate_params = { + "max_budget": 0.001, + "budget_duration": "1m", + } + await litellm.proxy.proxy_server.prisma_client.connect() + try: + request = GenerateKeyRequest( + max_budget=200000, + budget_duration="30d", + ) + key = await generate_key_fn(request) + generated_key = key.key + + result = await info_key_fn(key=generated_key) + key_info = result["info"] + # assert it used the upper bound for max_budget, and budget_duration + assert key_info["max_budget"] == 0.001 + assert key_info["budget_duration"] == "1m" + + print(result) + except Exception as e: + print("Got Exception", e) + pytest.fail(f"Got exception {e}") + + def test_get_bearer_token(): from litellm.proxy.proxy_server import _get_bearer_token From b0bcc4c6e3bea9d91841c9c47456ffc42dd3fb6b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:40:52 -0800 Subject: [PATCH 155/218] (feat) proxy - upperbound params /key/generate --- litellm/proxy/proxy_config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 874049a752..bd844bd7ba 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,6 +73,9 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None + upperbound_key_generate_params: + max_budget: 100 + duration: "30d" # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From 5cb583d50b9be5c8857fabcd57c5b9510c6881a7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:51:08 -0800 Subject: [PATCH 156/218] (fix) proxy startup test --- .../test_configs/test_config_no_auth.yaml | 95 ------------------- 1 file changed, 95 deletions(-) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index 8441018e35..ccebe016db 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,21 +9,11 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model -- litellm_params: - api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 - api_key: os.environ/AZURE_API_KEY - model: azure/chatgpt-v-2 - model_name: azure-cloudflare-model - litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo 
model_name: azure-model -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -36,93 +26,8 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 - model_name: test_openai_models -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-07-01-preview - model: azure/azure-embedding-model - model_info: - mode: embedding - model_name: azure-embedding-model -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 55848c55-4162-40f9-a6e2-9a722b9ef404 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 34339b1e-e030-4bcc-a531-c48559f10ce4 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: f6f74e14-ac64-4403-9365-319e584dcdc5 - model_name: test_openai_models -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 9b1ef341-322c-410a-8992-903987fef439 - model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings -- litellm_params: - model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 - model_info: - mode: embedding - model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) -- litellm_params: - model: dall-e-3 - model_info: - mode: image_generation - model_name: dall-e-3 -- litellm_params: - api_base: os.environ/AZURE_SWEDEN_API_BASE - api_key: os.environ/AZURE_SWEDEN_API_KEY - api_version: 2023-12-01-preview - model: azure/dall-e-3-test - model_info: - mode: image_generation - model_name: dall-e-3 -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-06-01-preview - model: azure/ - model_info: - mode: image_generation - model_name: dall-e-2 -- litellm_params: - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: 2023-07-01-preview - model: azure/azure-embedding-model - model_info: - base_model: text-embedding-ada-002 - mode: embedding - model_name: text-embedding-ada-002 -- litellm_params: - model: gpt-3.5-turbo - model_info: - description: this is a test openai model - id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 - model_name: test_openai_models From 57626a74d5d5f1f3d0fdd8fd51453e2a28468fe7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 22:53:31 -0800 Subject: [PATCH 157/218] (ci/cd) print debug info for test_proxy_gunicorn_startup_config_dict --- litellm/tests/test_proxy_startup.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/litellm/tests/test_proxy_startup.py b/litellm/tests/test_proxy_startup.py index 650e2f8a7a..a846c9f4a3 100644 --- a/litellm/tests/test_proxy_startup.py +++ b/litellm/tests/test_proxy_startup.py @@ -33,6 +33,11 @@ def test_proxy_gunicorn_startup_direct_config(): Test both approaches """ try: + from litellm._logging import 
verbose_proxy_logger, verbose_router_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) + verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -48,6 +53,11 @@ def test_proxy_gunicorn_startup_direct_config(): def test_proxy_gunicorn_startup_config_dict(): try: + from litellm._logging import verbose_proxy_logger, verbose_router_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) + verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" From edd6483fa1904bc5ab9d192de3666f236fc766e1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 06:46:49 -0800 Subject: [PATCH 158/218] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 34dc0e3b57..528bb19d2a 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -379,6 +379,7 @@ async def test_normal_router_tpm_limit(): ) except Exception as e: + print("Exception on test_normal_router_tpm_limit", e) assert e.status_code == 429 From 94e259903679ebd754125fa4480eb5be8cb15c5c Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:22:54 -0800 Subject: [PATCH 159/218] fix(ollama_chat.py): fix ollama chat completion token counting --- litellm/llms/ollama_chat.py | 8 ++++++-- litellm/utils.py | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 95ff8dfaa3..3628ae2903 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -320,11 +320,15 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj): model_response["choices"][0]["message"] = message else: model_response["choices"][0]["message"] = response_json["message"] + model_response["created"] = int(time.time()) - model_response["model"] = "ollama/" + data["model"] + model_response["model"] = "ollama_chat/" + data["model"] prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore completion_tokens = response_json.get( - "eval_count", litellm.token_counter(text=response_json["message"]) + "eval_count", + litellm.token_counter( + text=response_json["message"]["content"], count_response_tokens=True + ), ) model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens, diff --git a/litellm/utils.py b/litellm/utils.py index 1e83a319f4..8491a1d5e1 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -983,9 +983,6 @@ class Logging: verbose_logger.debug( f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" ) - verbose_logger.debug( - f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}" - ) if self.logger_fn and callable(self.logger_fn): try: self.logger_fn( From 946447c52fbff6d60faac40366515afda2f129df Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:26:13 -0800 Subject: [PATCH 160/218] fix(utils.py): use print_verbose for statements, so debug can be seen when running sdk --- litellm/utils.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index 8491a1d5e1..5ccb85ef05 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -980,7 +980,7 @@ class Logging: self.model_call_details["log_event_type"] = "post_api_call" # User Logging -> if you pass in a custom logging function - verbose_logger.debug( + print_verbose( f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n" ) if self.logger_fn and callable(self.logger_fn): From 9fc4c590911588582b506fff158f4d3ee0a68369 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:43:47 -0800 Subject: [PATCH 161/218] fix(ollama_chat.py): explicitly state if ollama call is streaming or not --- litellm/llms/ollama_chat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 3628ae2903..d1a439398b 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,7 +146,12 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "messages": messages, "options": optional_params} + data = { + "model": model, + "messages": messages, + "options": optional_params, + "stream": stream, + } ## LOGGING logging_obj.pre_call( input=None, From b8a2412634c52172e7611639406bffcb17b07686 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:44:04 -0800 Subject: [PATCH 162/218] =?UTF-8?q?bump:=20version=201.22.6=20=E2=86=92=20?= =?UTF-8?q?1.22.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 06dedbed63..be8c8966be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.6" +version = "1.22.7" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.6" +version = "1.22.7" version_files = [ "pyproject.toml:^version" ] From 9028c51368196f043ba4a6859cd48d39218aba73 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 07:35:46 -0800 Subject: [PATCH 163/218] build(requirements.txt): update the proxy requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c9bd0e511d..768e8dff3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions -google-generativeai==0.1.0 # for vertex ai calls +google-generativeai==0.3.2 # for vertex ai calls async_generator==1.10.0 # for async ollama calls traceloop-sdk==0.5.3 # for open telemetry logging langfuse>=2.6.3 # for langfuse self-hosted logging From bfd9a30e98f717540044838e48c1e51141f6ef9f Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:11:43 -0800 Subject: [PATCH 164/218] fix(ollama.py): support format for ollama --- litellm/llms/ollama.py | 10 +++++++++- litellm/llms/ollama_chat.py | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index d0bc24af4c..9339deb78d 100644 --- a/litellm/llms/ollama.py +++ 
b/litellm/llms/ollama.py @@ -146,7 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) - data = {"model": model, "prompt": prompt, "options": optional_params} + format = optional_params.pop("format", None) + data = { + "model": model, + "prompt": prompt, + "options": optional_params, + "stream": stream, + } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index d1a439398b..0311931b13 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -146,12 +146,15 @@ def get_ollama_response( optional_params[k] = v stream = optional_params.pop("stream", False) + format = optional_params.pop("format", None) data = { "model": model, "messages": messages, "options": optional_params, "stream": stream, } + if format is not None: + data["format"] = format ## LOGGING logging_obj.pre_call( input=None, From f5f9e7005e317a0319784e28fbcb131253c171be Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 10:12:13 -0800 Subject: [PATCH 165/218] =?UTF-8?q?bump:=20version=201.22.7=20=E2=86=92=20?= =?UTF-8?q?1.22.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be8c8966be..17d80ae8ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.7" +version = "1.22.8" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.7" +version = "1.22.8" version_files = [ "pyproject.toml:^version" ] From 15f5b5808f5d04bcaa06a9adcaec0f8d0c1e2580 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:57:20 -0800 Subject: [PATCH 166/218] (ci/cd) run in verbose mode --- .circleci/config.yml | 2 +- litellm/tests/test_completion.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c1224159a1..9a29ed07ca 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From fdcf0d5625025587eea3dc1ea759cf494879d043 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:27:24 -0800 Subject: [PATCH 167/218] (fix) rename proxy startup test --- litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename litellm/tests/{test_proxy_startup.py => test_aproxy_startup.py} (100%) diff --git a/litellm/tests/test_proxy_startup.py 
b/litellm/tests/test_aproxy_startup.py similarity index 100% rename from litellm/tests/test_proxy_startup.py rename to litellm/tests/test_aproxy_startup.py From 53dc9e74f67a7f693524f01c67a048bbf8e33906 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:38:57 -0800 Subject: [PATCH 168/218] (fix) proxy_startup test --- litellm/tests/test_aproxy_startup.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/litellm/tests/test_aproxy_startup.py b/litellm/tests/test_aproxy_startup.py index a846c9f4a3..024d69b1ff 100644 --- a/litellm/tests/test_aproxy_startup.py +++ b/litellm/tests/test_aproxy_startup.py @@ -36,6 +36,11 @@ def test_proxy_gunicorn_startup_direct_config(): from litellm._logging import verbose_proxy_logger, verbose_router_logger import logging + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) filepath = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +54,10 @@ def test_proxy_gunicorn_startup_direct_config(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url def test_proxy_gunicorn_startup_config_dict(): @@ -58,6 +67,11 @@ def test_proxy_gunicorn_startup_config_dict(): verbose_proxy_logger.setLevel(level=logging.DEBUG) verbose_router_logger.setLevel(level=logging.DEBUG) + # unset set DATABASE_URL in env for this test + # set prisma client to None + setattr(litellm.proxy.proxy_server, "prisma_client", None) + database_url = os.environ.pop("DATABASE_URL", None) + filepath = os.path.dirname(os.path.abspath(__file__)) # test with worker_config = config yaml config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml" @@ -71,6 +85,10 @@ def test_proxy_gunicorn_startup_config_dict(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + finally: + # restore DATABASE_URL after the test + if database_url is not None: + os.environ["DATABASE_URL"] = database_url # test_proxy_gunicorn_startup() From d81aa90bfe278ffb82f991b8afd1ccdb1886eb8e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:16 -0800 Subject: [PATCH 169/218] (ci/cd) run pytest without -s --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a29ed07ca..c1224159a1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,7 +80,7 @@ jobs: command: | pwd ls - python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results From 4d28fcf6b6beeaec2c40cbfa01c05f0245f437ac Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:22:24 -0800 Subject: [PATCH 170/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add 
any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 3c7a94cd89462b23cf30748eb70fb952fd5344c9 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:43:28 -0800 Subject: [PATCH 171/218] (fix) parallel_request_limiter debug --- litellm/proxy/hooks/parallel_request_limiter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index ca60421a50..48cf5b7799 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -130,7 +130,9 @@ class _PROXY_MaxParallelRequestsHandler(CustomLogger): "current_rpm": current["current_rpm"] + 1, } - self.print_verbose(f"updated_value in success call: {new_val}") + self.print_verbose( + f"updated_value in success call: {new_val}, precise_minute: {precise_minute}" + ) self.user_api_key_cache.set_cache( request_count_api_key, new_val, ttl=60 ) # store in cache for 1 min. From cfa97fb0c64d0d47ed905951d54b46a6e26ad884 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:44:30 -0800 Subject: [PATCH 172/218] (fix) test_normal_router_tpm_limit --- litellm/tests/test_parallel_request_limiter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/tests/test_parallel_request_limiter.py b/litellm/tests/test_parallel_request_limiter.py index 528bb19d2a..bfac8ddeae 100644 --- a/litellm/tests/test_parallel_request_limiter.py +++ b/litellm/tests/test_parallel_request_limiter.py @@ -306,6 +306,10 @@ async def test_normal_router_call(): @pytest.mark.asyncio async def test_normal_router_tpm_limit(): + from litellm._logging import verbose_proxy_logger + import logging + + verbose_proxy_logger.setLevel(level=logging.DEBUG) model_list = [ { "model_name": "azure-model", @@ -353,6 +357,7 @@ async def test_normal_router_tpm_limit(): current_minute = datetime.now().strftime("%M") precise_minute = f"{current_date}-{current_hour}-{current_minute}" request_count_api_key = f"{_api_key}::{precise_minute}::request_count" + print("Test: Checking current_requests for precise_minute=", precise_minute) assert ( parallel_request_handler.user_api_key_cache.get_cache( @@ -366,6 +371,7 @@ async def test_normal_router_tpm_limit(): model="azure-model", messages=[{"role": "user", "content": "Write me a paragraph on the moon"}], metadata={"user_api_key": _api_key}, + mock_response="hello", ) await asyncio.sleep(1) # success is done in a separate thread print(f"response: {response}") From fbe52cf65382bddaac7192196cf2ad0a7de0f5f5 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:47:19 -0800 Subject: [PATCH 173/218] (ci/cd) fix test_config_no_auth --- .../test_configs/test_config_no_auth.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/litellm/tests/test_configs/test_config_no_auth.yaml b/litellm/tests/test_configs/test_config_no_auth.yaml index ccebe016db..9d7aff5702 100644 --- a/litellm/tests/test_configs/test_config_no_auth.yaml +++ b/litellm/tests/test_configs/test_config_no_auth.yaml @@ -9,11 +9,21 @@ model_list: api_key: os.environ/AZURE_CANADA_API_KEY model: azure/gpt-35-turbo model_name: azure-model +- litellm_params: + api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1 + api_key: os.environ/AZURE_API_KEY + model: azure/chatgpt-v-2 + model_name: azure-cloudflare-model - 
litellm_params: api_base: https://openai-france-1234.openai.azure.com api_key: os.environ/AZURE_FRANCE_API_KEY model: azure/gpt-turbo model_name: azure-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + model_name: test_openai_models - litellm_params: model: gpt-3.5-turbo model_info: @@ -26,8 +36,93 @@ model_list: description: this is a test openai model id: 4d1ee26c-abca-450c-8744-8e87fd6755e9 model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 00e19c0f-b63d-42bb-88e9-016fb0c60764 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 79fc75bf-8e1b-47d5-8d24-9365a854af03 + model_name: test_openai_models +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + mode: embedding + model_name: azure-embedding-model +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 55848c55-4162-40f9-a6e2-9a722b9ef404 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34339b1e-e030-4bcc-a531-c48559f10ce4 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: f6f74e14-ac64-4403-9365-319e584dcdc5 + model_name: test_openai_models +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 9b1ef341-322c-410a-8992-903987fef439 + model_name: test_openai_models - litellm_params: model: bedrock/amazon.titan-embed-text-v1 model_info: mode: embedding model_name: amazon-embeddings +- litellm_params: + model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: + mode: embedding + model_name: GPT-J 6B - Sagemaker Text Embedding (Internal) +- litellm_params: + model: dall-e-3 + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_SWEDEN_API_BASE + api_key: os.environ/AZURE_SWEDEN_API_KEY + api_version: 2023-12-01-preview + model: azure/dall-e-3-test + model_info: + mode: image_generation + model_name: dall-e-3 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-06-01-preview + model: azure/ + model_info: + mode: image_generation + model_name: dall-e-2 +- litellm_params: + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: 2023-07-01-preview + model: azure/azure-embedding-model + model_info: + base_model: text-embedding-ada-002 + mode: embedding + model_name: text-embedding-ada-002 +- litellm_params: + model: gpt-3.5-turbo + model_info: + description: this is a test openai model + id: 34cb2419-7c63-44ae-a189-53f1d1ce5953 + model_name: test_openai_models \ No newline at end of file From 6db179dc88e0b87a429e4d300c33197d9fe44f0a Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 12:53:47 -0800 Subject: [PATCH 174/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..e0ee05d4f4 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def 
test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the, response + # Add any assertions here to check the,response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From f19897643f56360b25b18d2a8a08283683d69b65 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:02:36 -0800 Subject: [PATCH 175/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index e0ee05d4f4..bd0301f204 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -41,7 +41,7 @@ def test_completion_custom_provider_model_name(): messages=messages, logger_fn=logger_fn, ) - # Add any assertions here to check the,response + # Add any assertions here to check the, response print(response) print(response["choices"][0]["finish_reason"]) except litellm.Timeout as e: From 84b0a8eb903d1c2358b6875a4e9e87d7fd2dcaf0 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:29 -0800 Subject: [PATCH 176/218] fix(utils.py): round max tokens to be int always --- litellm/tests/test_completion.py | 5 +++-- litellm/utils.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index bd0301f204..de79c97afa 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -544,13 +544,13 @@ def hf_test_completion_tgi(): def test_completion_openai(): try: litellm.set_verbose = True + litellm.drop_params = True print(f"api key: {os.environ['OPENAI_API_KEY']}") litellm.api_key = os.environ["OPENAI_API_KEY"] response = completion( model="gpt-3.5-turbo", - messages=messages, + messages=[{"role": "user", "content": "Hey"}], max_tokens=10, - request_timeout=1, metadata={"hi": "bye"}, ) print("This is the response object\n", response) @@ -565,6 +565,7 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None + raise Exception("it works!") except Timeout as e: pass except Exception as e: diff --git a/litellm/utils.py b/litellm/utils.py index 5ccb85ef05..fdca57e51f 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2348,7 +2348,9 @@ def client(original_function): elif user_max_tokens + input_tokens > max_output_tokens: user_max_tokens = max_output_tokens - input_tokens print_verbose(f"user_max_tokens: {user_max_tokens}") - kwargs["max_tokens"] = user_max_tokens + kwargs["max_tokens"] = int( + round(user_max_tokens) + ) # make sure max tokens is always an int except Exception as e: print_verbose(f"Error while checking max token limit: {str(e)}") # MODEL CALL From 9d8734ef32ce1fcfd075c3658e9cca31a66c301b Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:10:49 -0800 Subject: [PATCH 177/218] =?UTF-8?q?bump:=20version=201.22.8=20=E2=86=92=20?= =?UTF-8?q?1.22.9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17d80ae8ee..944aad7f8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.8" +version = "1.22.9" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] 
build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.8" +version = "1.22.9" version_files = [ "pyproject.toml:^version" ] From 122867370f4568ab3f79ec00da1a8a7600dcc516 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:09:48 -0800 Subject: [PATCH 178/218] (feat) show langfuse logging tags better through proxy --- litellm/integrations/langfuse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 82de333660..3c3e793dfb 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -252,8 +252,14 @@ class LangFuseLogger: print_verbose(f"trace: {cost}") if supports_tags: for key, value in metadata.items(): - tags.append(f"{key}:{value}") + if key in [ + "user_api_key", + "user_api_key_user_id", + ]: + tags.append(f"{key}:{value}") if "cache_hit" in kwargs: + if kwargs["cache_hit"] is None: + kwargs["cache_hit"] = False tags.append(f"cache_hit:{kwargs['cache_hit']}") trace_params.update({"tags": tags}) From ff985b0f63d4e5b2b182a0cd45bacbee0a5138c1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 12:28:21 -0800 Subject: [PATCH 179/218] (feat )add semantic cache --- litellm/caching.py | 102 +++++++++++++++++++++++++++++++++- litellm/tests/test_caching.py | 25 +++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index d0721fe9a9..e1ef95dc34 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -83,7 +83,6 @@ class InMemoryCache(BaseCache): self.cache_dict.clear() self.ttl_dict.clear() - async def disconnect(self): pass @@ -217,7 +216,6 @@ class RedisCache(BaseCache): def flush_cache(self): self.redis_client.flushall() - async def disconnect(self): pass @@ -225,6 +223,102 @@ class RedisCache(BaseCache): self.redis_client.delete(key) +class RedisSemanticCache(RedisCache): + def __init__(self, host, port, password, **kwargs): + super().__init__() + + # from redis.commands.search.field import TagField, TextField, NumericField, VectorField + # from redis.commands.search.indexDefinition import IndexDefinition, IndexType + # from redis.commands.search.query import Query + + # INDEX_NAME = 'idx:litellm_completion_response_vss' + # DOC_PREFIX = 'bikes:' + + # try: + # # check to see if index exists + # client.ft(INDEX_NAME).info() + # print('Index already exists!') + # except: + # # schema + # schema = ( + # TextField('$.model', no_stem=True, as_name='model'), + # TextField('$.brand', no_stem=True, as_name='brand'), + # NumericField('$.price', as_name='price'), + # TagField('$.type', as_name='type'), + # TextField('$.description', as_name='description'), + # VectorField('$.description_embeddings', + # 'FLAT', { + # 'TYPE': 'FLOAT32', + # 'DIM': VECTOR_DIMENSION, + # 'DISTANCE_METRIC': 'COSINE', + # }, as_name='vector' + # ), + # ) + + # # index Definition + # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) + + # # create Index + # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + + def set_cache(self, key, value, **kwargs): + ttl = kwargs.get("ttl", None) + print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") + try: + # get text response + # print("in redis semantic cache: value: ", value) + llm_response = value["response"] + + # if llm_response is a string, convert it to a dictionary + if isinstance(llm_response, str): + llm_response = json.loads(llm_response) + + # print("converted 
llm_response: ", llm_response) + response = llm_response["choices"][0]["message"]["content"] + + # create embedding response + + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=response, + cache={"no-store": True}, + ) + + raw_embedding = embedding_response["data"][0]["embedding"] + raw_embedding_dimension = len(raw_embedding) + + # print("embedding: ", raw_embedding) + key = "litellm-semantic:" + key + self.redis_client.json().set( + name=key, + path="$", + obj=json.dumps( + { + "response": response, + "embedding": raw_embedding, + "dimension": raw_embedding_dimension, + } + ), + ) + + stored_redis_value = self.redis_client.json().get(name=key) + + # print("Stored Redis Value: ", stored_redis_value) + + except Exception as e: + # print("Error occurred: ", e) + # NON blocking - notify users Redis is throwing an exception + logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + + def get_cache(self, key, **kwargs): + pass + + async def async_set_cache(self, key, value, **kwargs): + pass + + async def async_get_cache(self, key, **kwargs): + pass + class S3Cache(BaseCache): def __init__( @@ -429,7 +523,7 @@ class DualCache(BaseCache): class Cache: def __init__( self, - type: Optional[Literal["local", "redis", "s3"]] = "local", + type: Optional[Literal["local", "redis", "redis-semantic", "s3"]] = "local", host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, @@ -468,6 +562,8 @@ class Cache: """ if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) + elif type == "redis-semantic": + self.cache = RedisSemanticCache(host, port, password, **kwargs) elif type == "local": self.cache = InMemoryCache() elif type == "s3": diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 468ab6f80f..32904ab784 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -987,3 +987,28 @@ def test_cache_context_managers(): # test_cache_context_managers() + + +def test_redis_semantic_cache_completion(): + litellm.set_verbose = False + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + messages = [ + {"role": "user", "content": f"write a one sentence poem about: {random_number}"} + ] + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + ) + print("test2 for Redis Caching - non streaming") + response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) + # response2 = completion( + # model="gpt-3.5-turbo", messages=messages,max_tokens=20 + # ) + + +# test_redis_cache_completion() From 1e1721fc6e093a6d6eb6850efcff6c12e918f109 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:12 -0800 Subject: [PATCH 180/218] (feat) working - sync semantic caching --- litellm/caching.py | 227 ++++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 75 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index e1ef95dc34..0a1046f0d8 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -223,94 +223,161 @@ class RedisCache(BaseCache): self.redis_client.delete(key) -class RedisSemanticCache(RedisCache): - def __init__(self, host, port, password, **kwargs): - super().__init__() +class RedisSemanticCache(BaseCache): + def __init__( + self, + host=None, + port=None, + password=None, + redis_url=None, + 
similarity_threshold=None, + **kwargs, + ): + from redisvl.index import SearchIndex + from redisvl.query import VectorQuery - # from redis.commands.search.field import TagField, TextField, NumericField, VectorField - # from redis.commands.search.indexDefinition import IndexDefinition, IndexType - # from redis.commands.search.query import Query + print_verbose( + "redis semantic-cache initializing INDEX - litellm_semantic_cache_index" + ) + if similarity_threshold is None: + raise Exception("similarity_threshold must be provided, passed None") + self.similarity_threshold = similarity_threshold + schema = { + "index": { + "name": "litellm_semantic_cache_index", + "prefix": "litellm", + "storage_type": "hash", + }, + "fields": { + "text": [{"name": "response"}], + "text": [{"name": "prompt"}], + "vector": [ + { + "name": "litellm_embedding", + "dims": 1536, + "distance_metric": "cosine", + "algorithm": "flat", + "datatype": "float32", + } + ], + }, + } + self.index = SearchIndex.from_dict(schema) + if redis_url is None: + # if no url passed, check if host, port and password are passed, if not raise an Exception + if host is None or port is None or password is None: + raise Exception(f"Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port + print_verbose(f"redis semantic-cache redis_url: {redis_url}") + self.index.connect(redis_url=redis_url) + self.index.create(overwrite=False) # don't overwrite existing index - # INDEX_NAME = 'idx:litellm_completion_response_vss' - # DOC_PREFIX = 'bikes:' + def _get_cache_logic(self, cached_response: Any): + """ + Common 'get_cache_logic' across sync + async redis client implementations + """ + if cached_response is None: + return cached_response - # try: - # # check to see if index exists - # client.ft(INDEX_NAME).info() - # print('Index already exists!') - # except: - # # schema - # schema = ( - # TextField('$.model', no_stem=True, as_name='model'), - # TextField('$.brand', no_stem=True, as_name='brand'), - # NumericField('$.price', as_name='price'), - # TagField('$.type', as_name='type'), - # TextField('$.description', as_name='description'), - # VectorField('$.description_embeddings', - # 'FLAT', { - # 'TYPE': 'FLOAT32', - # 'DIM': VECTOR_DIMENSION, - # 'DISTANCE_METRIC': 'COSINE', - # }, as_name='vector' - # ), - # ) + # check if cached_response is bytes + if isinstance(cached_response, bytes): + cached_response = cached_response.decode("utf-8") - # # index Definition - # definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.JSON) - - # # create Index - # client.ft(INDEX_NAME).create_index(fields=schema, definition=definition) + try: + cached_response = json.loads( + cached_response + ) # Convert string to dictionary + except: + cached_response = ast.literal_eval(cached_response) + return cached_response def set_cache(self, key, value, **kwargs): - ttl = kwargs.get("ttl", None) - print_verbose(f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}") - try: - # get text response - # print("in redis semantic cache: value: ", value) - llm_response = value["response"] + import numpy as np - # if llm_response is a string, convert it to a dictionary - if isinstance(llm_response, str): - llm_response = json.loads(llm_response) + print_verbose(f"redis semantic-cache set_cache, kwargs: {kwargs}") - # print("converted llm_response: ", llm_response) - response = llm_response["choices"][0]["message"]["content"] + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message 
in messages: + prompt += message["content"] - # create embedding response + # create an embedding for prompt + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) - embedding_response = litellm.embedding( - model="text-embedding-ada-002", - input=response, - cache={"no-store": True}, - ) + # get the embedding + embedding = embedding_response["data"][0]["embedding"] - raw_embedding = embedding_response["data"][0]["embedding"] - raw_embedding_dimension = len(raw_embedding) + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) - # print("embedding: ", raw_embedding) - key = "litellm-semantic:" + key - self.redis_client.json().set( - name=key, - path="$", - obj=json.dumps( - { - "response": response, - "embedding": raw_embedding, - "dimension": raw_embedding_dimension, - } - ), - ) + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] - stored_redis_value = self.redis_client.json().get(name=key) + # Add more data + keys = self.index.load(new_data) - # print("Stored Redis Value: ", stored_redis_value) - - except Exception as e: - # print("Error occurred: ", e) - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) + pass def get_cache(self, key, **kwargs): + print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = litellm.embedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + num_results=1, + ) + + results = self.index.query(query) + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! + return None + pass async def async_set_cache(self, key, value, **kwargs): @@ -527,6 +594,7 @@ class Cache: host: Optional[str] = None, port: Optional[str] = None, password: Optional[str] = None, + similarity_threshold: Optional[float] = None, supported_call_types: Optional[ List[Literal["completion", "acompletion", "embedding", "aembedding"]] ] = ["completion", "acompletion", "embedding", "aembedding"], @@ -547,10 +615,12 @@ class Cache: Initializes the cache based on the given type. Args: - type (str, optional): The type of cache to initialize. Can be "local" or "redis". 
Defaults to "local". + type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", or "s3". Defaults to "local". host (str, optional): The host address for the Redis cache. Required if type is "redis". port (int, optional): The port number for the Redis cache. Required if type is "redis". password (str, optional): The password for the Redis cache. Required if type is "redis". + similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" + supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types. **kwargs: Additional keyword arguments for redis.Redis() cache @@ -563,7 +633,13 @@ class Cache: if type == "redis": self.cache: BaseCache = RedisCache(host, port, password, **kwargs) elif type == "redis-semantic": - self.cache = RedisSemanticCache(host, port, password, **kwargs) + self.cache = RedisSemanticCache( + host, + port, + password, + similarity_threshold=similarity_threshold, + **kwargs, + ) elif type == "local": self.cache = InMemoryCache() elif type == "s3": @@ -743,6 +819,7 @@ class Cache: The cached result if it exists, otherwise None. """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -752,7 +829,7 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = self.cache.get_cache(cache_key) + cached_result = self.cache.get_cache(cache_key, messages=messages) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From 01443f91d7c5df8b3b46f554586bca7805a0a672 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 17:58:32 -0800 Subject: [PATCH 181/218] (test) semantic cache --- litellm/tests/test_caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 32904ab784..3ac812cf35 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -990,7 +990,7 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): - litellm.set_verbose = False + litellm.set_verbose = True random_number = random.randint( 1, 100000 @@ -1003,6 +1003,7 @@ def test_redis_semantic_cache_completion(): host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.5, ) print("test2 for Redis Caching - non streaming") response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) From 9ce95fe5f18babb1fa6103f4fbb9c605ce63a498 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:22:50 -0800 Subject: [PATCH 182/218] (test) semantic caching --- litellm/tests/test_caching.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 3ac812cf35..4b47614cca 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -995,21 +995,29 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 ) # add a random number to ensure it's always adding / reading from cache - messages = [ - {"role": "user", "content": f"write a one sentence poem about: {random_number}"} - ] + + print("testing semantic caching") litellm.cache = Cache( type="redis-semantic", host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], 
password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.5, + similarity_threshold=0.8, ) - print("test2 for Redis Caching - non streaming") - response1 = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=20) - # response2 = completion( - # model="gpt-3.5-turbo", messages=messages,max_tokens=20 - # ) + response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" + # assert response1.choices[0].message == 1 # test_redis_cache_completion() From aaed4d1e02dedcd3e96904ba40b86e550db98b1c Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Mon, 5 Feb 2024 18:25:22 -0800 Subject: [PATCH 183/218] (fix) semantic cache --- litellm/caching.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index 0a1046f0d8..877f935fab 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -270,7 +270,10 @@ class RedisSemanticCache(BaseCache): redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") self.index.connect(redis_url=redis_url) - self.index.create(overwrite=False) # don't overwrite existing index + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") def _get_cache_logic(self, cached_response: Any): """ From d8e9bccdf1cd8e26ce7c59672e4af6a3c85548b5 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:13:12 -0800 Subject: [PATCH 184/218] (feat) RedisSemanticCache - async --- litellm/caching.py | 112 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 877f935fab..ad37f2077c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -231,6 +231,7 @@ class RedisSemanticCache(BaseCache): password=None, redis_url=None, similarity_threshold=None, + use_async=False, **kwargs, ): from redisvl.index import SearchIndex @@ -262,14 +263,19 @@ class RedisSemanticCache(BaseCache): ], }, } - self.index = SearchIndex.from_dict(schema) if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: raise Exception(f"Redis host, port, and password must be provided") redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") - self.index.connect(redis_url=redis_url) + if use_async == False: + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url) + elif use_async == True: + schema["index"]["name"] = "litellm_semantic_cache_index_async" + self.index = SearchIndex.from_dict(schema) + self.index.connect(redis_url=redis_url, use_async=True) try: self.index.create(overwrite=False) # don't overwrite existing index except Exception as e: @@ -327,10 +333,10 @@ class RedisSemanticCache(BaseCache): # Add more data keys = self.index.load(new_data) - pass + return def get_cache(self, key, **kwargs): - print_verbose(f"redis semantic-cache get_cache, kwargs: {kwargs}") + print_verbose(f"sync redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np @@ -360,6 +366,11 @@ class 
RedisSemanticCache(BaseCache): ) results = self.index.query(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None vector_distance = results[0]["vector_distance"] vector_distance = float(vector_distance) @@ -384,9 +395,93 @@ class RedisSemanticCache(BaseCache): pass async def async_set_cache(self, key, value, **kwargs): - pass + import numpy as np + + print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") + + # get the prompt + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + # create an embedding for prompt + + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + # make the embedding a numpy array, convert to bytes + embedding_bytes = np.array(embedding, dtype=np.float32).tobytes() + value = str(value) + assert isinstance(value, str) + + new_data = [ + {"response": value, "prompt": prompt, "litellm_embedding": embedding_bytes} + ] + + # Add more data + keys = await self.index.aload(new_data) + return async def async_get_cache(self, key, **kwargs): + print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") + from redisvl.query import VectorQuery + import numpy as np + + # query + + # get the messages + messages = kwargs["messages"] + prompt = "" + for message in messages: + prompt += message["content"] + + # convert to embedding + embedding_response = await litellm.aembedding( + model="text-embedding-ada-002", + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + + # get the embedding + embedding = embedding_response["data"][0]["embedding"] + + query = VectorQuery( + vector=embedding, + vector_field_name="litellm_embedding", + return_fields=["response", "prompt", "vector_distance"], + ) + results = await self.index.aquery(query) + if results == None: + return None + if isinstance(results, list): + if len(results) == 0: + return None + + vector_distance = results[0]["vector_distance"] + vector_distance = float(vector_distance) + similarity = 1 - vector_distance + cached_prompt = results[0]["prompt"] + + # check similarity, if more than self.similarity_threshold, return results + print_verbose( + f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" + ) + if similarity > self.similarity_threshold: + # cache hit ! + cached_value = results[0]["response"] + print_verbose( + f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}" + ) + return self._get_cache_logic(cached_response=cached_value) + else: + # cache miss ! 
+ return None pass @@ -612,6 +707,7 @@ class Cache: s3_aws_secret_access_key: Optional[str] = None, s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, + redis_semantic_cache_use_async=False, **kwargs, ): """ @@ -641,6 +737,7 @@ class Cache: port, password, similarity_threshold=similarity_threshold, + use_async=redis_semantic_cache_use_async, **kwargs, ) elif type == "local": @@ -847,6 +944,7 @@ class Cache: Used for embedding calls in async wrapper """ try: # never block execution + messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: @@ -856,7 +954,9 @@ class Cache: max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache(cache_key) + cached_result = await self.cache.async_get_cache( + cache_key, messages=messages + ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) From 3f0740b48545452a7fa87af2ed320537dc26a5e7 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:14:54 -0800 Subject: [PATCH 185/218] (test) async semantic cache --- litellm/tests/test_caching.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 4b47614cca..a1a42ff659 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -991,6 +991,9 @@ def test_cache_context_managers(): def test_redis_semantic_cache_completion(): litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) random_number = random.randint( 1, 100000 @@ -1021,3 +1024,38 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() + + +@pytest.mark.asyncio +async def test_redis_semantic_cache_acompletion(): + litellm.set_verbose = True + import logging + + logging.basicConfig(level=logging.DEBUG) + + random_number = random.randint( + 1, 100000 + ) # add a random number to ensure it's always adding / reading from cache + + print("testing semantic caching") + litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_use_async=True, + ) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response1: {response1}") + + assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" From d184a388e903fc2aaa7d2e48ec9b8c13a2b75709 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:52:57 -0800 Subject: [PATCH 186/218] (feat) working semantic-cache on litellm proxy --- litellm/caching.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index ad37f2077c..a7958d074c 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -266,21 +266,30 @@ class RedisSemanticCache(BaseCache): if redis_url is None: # if no url passed, check if host, port and password are passed, if not raise an Exception if host is None or port is None or password is None: - raise Exception(f"Redis host, port, and password must be provided") + # try checking env for host, port and password + import os + + host = os.getenv("REDIS_HOST") + port = os.getenv("REDIS_PORT") + password = os.getenv("REDIS_PASSWORD") + if host is None or 
port is None or password is None: + raise Exception("Redis host, port, and password must be provided") + redis_url = "redis://:" + password + "@" + host + ":" + port print_verbose(f"redis semantic-cache redis_url: {redis_url}") if use_async == False: self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url) + try: + self.index.create(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") elif use_async == True: schema["index"]["name"] = "litellm_semantic_cache_index_async" self.index = SearchIndex.from_dict(schema) self.index.connect(redis_url=redis_url, use_async=True) - try: - self.index.create(overwrite=False) # don't overwrite existing index - except Exception as e: - print_verbose(f"Got exception creating semantic cache index: {str(e)}") + # def _get_cache_logic(self, cached_response: Any): """ Common 'get_cache_logic' across sync + async redis client implementations @@ -397,6 +406,10 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + try: + await self.index.acreate(overwrite=False) # don't overwrite existing index + except Exception as e: + print_verbose(f"Got exception creating semantic cache index: {str(e)}") print_verbose(f"async redis semantic-cache set_cache, kwargs: {kwargs}") # get the prompt From 9547ec321dba4e1e9e35556c105ee6836ceafebf Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:54:36 -0800 Subject: [PATCH 187/218] (feat) redis-semantic cache --- litellm/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index fdca57e51f..c25572c03c 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -55,7 +55,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import OpenAIError as OriginalError from openai._models import BaseModel as OpenAIObject -from .caching import S3Cache +from .caching import S3Cache, RedisSemanticCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -2533,6 +2533,14 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None + elif isinstance(litellm.cache.cache, RedisSemanticCache): + preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) + kwargs[ + "preset_cache_key" + ] = preset_cache_key # for streaming calls, we need to pass the preset_cache_key + cached_result = await litellm.cache.async_get_cache( + *args, **kwargs + ) else: preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs[ From 673b99ca4fa7a6902aa09d4244d7aec8c36aa7eb Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 08:55:25 -0800 Subject: [PATCH 188/218] (feat) working semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index bd844bd7ba..41c3b41828 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -73,10 +73,12 @@ litellm_settings: max_budget: 1.5000 models: ["azure-gpt-3.5"] duration: None - upperbound_key_generate_params: - max_budget: 100 - duration: "30d" - # cache: True + cache: True # set cache responses to True + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 + redis_semantic_cache_use_async: True + # cache: True # setting callback class # 
callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From cf0f4ae93979ccfe6d6e2e89ed3037f22b580cbc Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:30:45 -0800 Subject: [PATCH 189/218] (fix) add redisvl==0.0.7 --- .circleci/requirements.txt | 3 ++- requirements.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 85b576bff2..4730fc28b1 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -10,4 +10,5 @@ anthropic boto3 orjson pydantic -google-cloud-aiplatform \ No newline at end of file +google-cloud-aiplatform +redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 768e8dff3f..b0a49553d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching +redisvl==0.0.7 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From 104428605e5e8c9b1e245c0abc618d088ea48e41 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 09:31:57 -0800 Subject: [PATCH 190/218] (feat) log semantic_sim to langfuse --- litellm/caching.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/caching.py b/litellm/caching.py index a7958d074c..133d1db6dd 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -471,9 +471,11 @@ class RedisSemanticCache(BaseCache): ) results = await self.index.aquery(query) if results == None: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None if isinstance(results, list): if len(results) == 0: + kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0 return None vector_distance = results[0]["vector_distance"] @@ -485,6 +487,10 @@ class RedisSemanticCache(BaseCache): print_verbose( f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}" ) + + # update kwargs["metadata"] with similarity, don't rewrite the original metadata + kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity + if similarity > self.similarity_threshold: # cache hit ! 
cached_value = results[0]["response"] @@ -968,7 +974,7 @@ class Cache: "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) cached_result = await self.cache.async_get_cache( - cache_key, messages=messages + cache_key, *args, **kwargs ) return self._get_cache_logic( cached_result=cached_result, max_age=max_age From 388bef1454b0b3c575893384da18c0804ae42451 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:22:02 -0800 Subject: [PATCH 191/218] allow setting redis_semantic cache_embedding model --- litellm/caching.py | 54 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/litellm/caching.py b/litellm/caching.py index 133d1db6dd..6bf53ea451 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -232,6 +232,7 @@ class RedisSemanticCache(BaseCache): redis_url=None, similarity_threshold=None, use_async=False, + embedding_model="text-embedding-ada-002", **kwargs, ): from redisvl.index import SearchIndex @@ -243,6 +244,7 @@ class RedisSemanticCache(BaseCache): if similarity_threshold is None: raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold + self.embedding_model = embedding_model schema = { "index": { "name": "litellm_semantic_cache_index", @@ -322,7 +324,7 @@ class RedisSemanticCache(BaseCache): # create an embedding for prompt embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -359,7 +361,7 @@ class RedisSemanticCache(BaseCache): # convert to embedding embedding_response = litellm.embedding( - model="text-embedding-ada-002", + model=self.embedding_model, input=prompt, cache={"no-store": True, "no-cache": True}, ) @@ -405,6 +407,7 @@ class RedisSemanticCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list try: await self.index.acreate(overwrite=False) # don't overwrite existing index @@ -418,12 +421,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] # create an embedding for prompt - - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + [m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -445,6 +460,7 @@ class RedisSemanticCache(BaseCache): print_verbose(f"async redis semantic-cache get_cache, kwargs: {kwargs}") from redisvl.query import VectorQuery import numpy as np + from litellm.proxy.proxy_server import llm_router, llm_model_list # query @@ -454,12 +470,24 @@ class RedisSemanticCache(BaseCache): for message in messages: prompt += message["content"] - # convert to embedding - embedding_response = await litellm.aembedding( - model="text-embedding-ada-002", - input=prompt, - cache={"no-store": True, "no-cache": True}, + router_model_names = ( + 
[m["model_name"] for m in llm_model_list] + if llm_model_list is not None + else [] ) + if llm_router is not None and self.embedding_model in router_model_names: + embedding_response = await llm_router.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) + else: + # convert to embedding + embedding_response = await litellm.aembedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ) # get the embedding embedding = embedding_response["data"][0]["embedding"] @@ -727,6 +755,7 @@ class Cache: s3_aws_session_token: Optional[str] = None, s3_config: Optional[Any] = None, redis_semantic_cache_use_async=False, + redis_semantic_cache_embedding_model="text-embedding-ada-002", **kwargs, ): """ @@ -757,6 +786,7 @@ class Cache: password, similarity_threshold=similarity_threshold, use_async=redis_semantic_cache_use_async, + embedding_model=redis_semantic_cache_embedding_model, **kwargs, ) elif type == "local": From ca2c9490dc4dfdce3894a122e107f42260262b4e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:27:33 -0800 Subject: [PATCH 192/218] (fix) use semantic cache on proxy --- litellm/proxy/proxy_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 41c3b41828..326544f41e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -77,7 +77,7 @@ litellm_settings: cache_params: type: "redis-semantic" similarity_threshold: 0.8 - redis_semantic_cache_use_async: True + redis_semantic_cache_embedding_model: azure-embedding-model # cache: True # setting callback class # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] From f042c33c0c16e616abb81afed4758e6b4556944e Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:32:07 -0800 Subject: [PATCH 193/218] (docs) using semantic caching on proxy --- docs/my-website/docs/proxy/caching.md | 52 ++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 03bb9fed34..3f26878241 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -9,7 +9,7 @@ LiteLLM supports: - Redis Cache - s3 Bucket Cache -## Quick Start - Redis, s3 Cache +## Quick Start - Redis, s3 Cache, Semantic Cache @@ -84,6 +84,56 @@ litellm_settings: $ litellm --config /path/to/config.yaml ``` + + + + +Caching can be enabled by adding the `cache` key in the `config.yaml` + +### Step 1: Add `cache` to the config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: azure-embedding-model + litellm_params: + model: azure/azure-embedding-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + +litellm_settings: + set_verbose: True + cache: True # set cache responses to True, litellm defaults to using a redis cache + cache_params: + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache + redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list +``` + +### Step 2: Add Redis Credentials to .env +Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. 
+ + ```shell + REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' + ## OR ## + REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' + REDIS_PORT = "" # REDIS_PORT='18841' + REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' + ``` + +**Additional kwargs** +You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +```shell +REDIS_ = "" +``` + +### Step 3: Run proxy with config +```shell +$ litellm --config /path/to/config.yaml +``` + From 4fac36f19bb78de4e22e5adc0f5e409cf75cce13 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:35:21 -0800 Subject: [PATCH 194/218] (feat) redis-semantic cache on proxy --- litellm/proxy/proxy_server.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 494c874147..661e932f37 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1168,7 +1168,7 @@ class ProxyConfig: verbose_proxy_logger.debug(f"passed cache type={cache_type}") - if cache_type == "redis": + if cache_type == "redis" or cache_type == "redis-semantic": cache_host = litellm.get_secret("REDIS_HOST", None) cache_port = litellm.get_secret("REDIS_PORT", None) cache_password = litellm.get_secret("REDIS_PASSWORD", None) @@ -1195,6 +1195,9 @@ class ProxyConfig: f"{blue_color_code}Cache Password:{reset_color_code} {cache_password}" ) print() # noqa + if cache_type == "redis-semantic": + # by default this should always be async + cache_params.update({"redis_semantic_cache_use_async": True}) # users can pass os.environ/ variables on the proxy - we should read them from the env for key, value in cache_params.items(): diff --git a/requirements.txt b/requirements.txt index b0a49553d1..3ace5872ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==4.6.0 # caching redisvl==0.0.7 # semantic caching +numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions google-generativeai==0.3.2 # for vertex ai calls From ba8b49311ed816d5da8c7e50981c2aceda03c638 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:39:44 -0800 Subject: [PATCH 195/218] (fix) test-semantic caching --- litellm/tests/test_caching.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index a1a42ff659..cc18dda165 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1019,8 +1019,20 @@ def test_redis_semantic_cache_completion(): ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8p5GejSWLJ1pDI1lfhc6Idhwd2bDJ" - # assert response1.choices[0].message == 1 + random_number = random.randint(1, 100000) + + response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, + ) + print(f"response2: {response1}") + assert response1.id == response2.id # test_redis_cache_completion() @@ -1054,8 +1066,20 @@ async def test_redis_semantic_cache_acompletion(): "content": f"write a one sentence poem about: {random_number}", } ], - max_tokens=20, + max_tokens=5, ) print(f"response1: {response1}") - assert response1.id == "chatcmpl-8pI86yvT7fvgLDjngZSKULy1iP1o5" + random_number = 
random.randint(1, 100000) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=5, + ) + print(f"response2: {response2}") + assert response1.id == response2.id From de0c4c2dc57872325ce7376c97e8179b4784c904 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:53:28 -0800 Subject: [PATCH 196/218] (docs) redis cache --- docs/my-website/docs/caching/redis_cache.md | 68 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 8a580f087c..7b21d35b6c 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ b/docs/my-website/docs/caching/redis_cache.md @@ -1,11 +1,11 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching - In-Memory, Redis, s3 +# Caching - In-Memory, Redis, s3, Redis Semantic Cache [**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching.py) -## Initialize Cache - In Memory, Redis, s3 Bucket +## Initialize Cache - In Memory, Redis, s3 Bucket, Redis Semantic Cache @@ -18,7 +18,7 @@ pip install redis ``` For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ -### Quick Start + ```python import litellm from litellm import completion @@ -55,7 +55,7 @@ Set AWS environment variables AWS_ACCESS_KEY_ID = "AKI*******" AWS_SECRET_ACCESS_KEY = "WOl*****" ``` -### Quick Start + ```python import litellm from litellm import completion @@ -80,6 +80,66 @@ response2 = completion( + + +Install redis +```shell +pip install redisvl==0.0.7 +``` + +For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ + +```python +import litellm +from litellm import completion +from litellm.caching import Cache + +random_number = random.randint( + 1, 100000 +) # add a random number to ensure it's always adding / reading from cache + +print("testing semantic caching") +litellm.cache = Cache( + type="redis-semantic", + host=os.environ["REDIS_HOST"], + port=os.environ["REDIS_PORT"], + password=os.environ["REDIS_PASSWORD"], + similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here +) +response1 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response1: {response1}") + +random_number = random.randint(1, 100000) + +response2 = completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": f"write a one sentence poem about: {random_number}", + } + ], + max_tokens=20, +) +print(f"response2: {response1}") +assert response1.id == response2.id +# response1 == response2, response 1 is cached +``` + + + + + ### Quick Start From e7a3302c1eb1dc182cd93c8ed4188cfaff0823fe Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:54:55 -0800 Subject: [PATCH 197/218] (docs) litellm semantic caching --- docs/my-website/docs/caching/redis_cache.md | 2 +- docs/my-website/docs/proxy/caching.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/my-website/docs/caching/redis_cache.md b/docs/my-website/docs/caching/redis_cache.md index 7b21d35b6c..75e1db9557 100644 --- a/docs/my-website/docs/caching/redis_cache.md +++ 
b/docs/my-website/docs/caching/redis_cache.md @@ -104,7 +104,7 @@ litellm.cache = Cache( host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], - similarity_threshold=0.8, + similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3f26878241..d5b589e5c2 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -7,6 +7,7 @@ Cache LLM Responses LiteLLM supports: - In Memory Cache - Redis Cache +- Redis Semantic Cache - s3 Bucket Cache ## Quick Start - Redis, s3 Cache, Semantic Cache From cb46db684c2a290c5b4256fa79e58cc04ead5ac2 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 10:55:15 -0800 Subject: [PATCH 198/218] (fix) semantic caching --- litellm/tests/test_caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index cc18dda165..96fd8eb9d2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1006,6 +1006,7 @@ def test_redis_semantic_cache_completion(): port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, + redis_semantic_cache_embedding_model="text-embedding-ada-002", ) response1 = completion( model="gpt-3.5-turbo", From a3e54117530834df54001f58f19c719fc83e716f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 11:04:19 -0800 Subject: [PATCH 199/218] (fix) mark semantic caching as beta test --- litellm/tests/test_caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 96fd8eb9d2..6cb5b974a1 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -989,6 +989,7 @@ def test_cache_context_managers(): # test_cache_context_managers() +@pytest.mark.skip(reason="beta test - new redis semantic cache") def test_redis_semantic_cache_completion(): litellm.set_verbose = True import logging @@ -1039,6 +1040,7 @@ def test_redis_semantic_cache_completion(): # test_redis_cache_completion() +@pytest.mark.skip(reason="beta test - new redis semantic cache") @pytest.mark.asyncio async def test_redis_semantic_cache_acompletion(): litellm.set_verbose = True From 2627b72e2247d1a0ed37674e05448f7f228636a0 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:26:48 -0800 Subject: [PATCH 200/218] (ci/cd) run again --- litellm/tests/test_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 6cb5b974a1..8433941e90 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -998,7 +998,7 @@ def test_redis_semantic_cache_completion(): random_number = random.randint( 1, 100000 - ) # add a random number to ensure it's always adding / reading from cache + ) # add a random number to ensure it's always adding /reading from cache print("testing semantic caching") litellm.cache = Cache( From 603e540f24b83f8b2a6d37e82c0ee92d91134f62 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:29:31 -0800 Subject: [PATCH 201/218] test(test_completion.py): fix test --- 
docs/my-website/docs/proxy/caching.md | 7 ++++--- litellm/tests/test_completion.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d5b589e5c2..2b385de8e5 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -211,9 +211,10 @@ litellm_settings: The proxy support 3 cache-controls: -- `ttl`: Will cache the response for the user-defined amount of time (in seconds). -- `s-maxage`: Will only accept cached responses that are within user-defined range (in seconds). -- `no-cache`: Will not return a cached response, but instead call the actual endpoint. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). +- `no-cache`: *Optional(bool)* Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* Will not cache the response. [Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index de79c97afa..b075e48190 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -565,7 +565,6 @@ def test_completion_openai(): assert len(response_str) > 1 litellm.api_key = None - raise Exception("it works!") except Timeout as e: pass except Exception as e: From c3ebb8e669574bd3a1214ed045f5e9fbe9d802a3 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:35:25 -0800 Subject: [PATCH 202/218] (feat) show semantic-cache on health/readiness --- litellm/caching.py | 3 +++ litellm/proxy/proxy_server.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/litellm/caching.py b/litellm/caching.py index 6bf53ea451..f996a58735 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -531,6 +531,9 @@ class RedisSemanticCache(BaseCache): return None pass + async def _index_info(self): + return await self.index.ainfo() + class S3Cache(BaseCache): def __init__( diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 661e932f37..427bb88a9c 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -4051,8 +4051,18 @@ async def health_readiness(): cache_type = None if litellm.cache is not None: + from litellm.caching import RedisSemanticCache + cache_type = litellm.cache.type + if isinstance(litellm.cache.cache, RedisSemanticCache): + # ping the cache + try: + index_info = await litellm.cache.cache._index_info() + except Exception as e: + index_info = "index does not exist - error: " + str(e) + cache_type = {"type": cache_type, "index_info": index_info} + if prisma_client is not None: # if db passed in, check if it's connected if prisma_client.db.is_connected() == True: response_object = {"db": "connected"} From b135c29fe9f724cc48af99ecd6345d049679bcdd Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:36:35 -0800 Subject: [PATCH 203/218] (fix) dockerfile requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ace5872ad..f2bff2680b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ pyyaml>=6.0.1 # server dep uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls -redis==4.6.0 # caching +redis==5.0.0 # 
caching redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db From 0a076c46a1fd14fb254214018c865d550fe5935b Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:41:22 -0800 Subject: [PATCH 204/218] (fix) langfuse show semantic-similarity in tags --- litellm/integrations/langfuse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 3c3e793dfb..3031868ec7 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -255,6 +255,7 @@ class LangFuseLogger: if key in [ "user_api_key", "user_api_key_user_id", + "semantic-similarity", ]: tags.append(f"{key}:{value}") if "cache_hit" in kwargs: From d1e23e6552f5b2aad0f58ddb6bfb7d6dd437f8f1 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 13:52:32 -0800 Subject: [PATCH 205/218] (fix) redisvl requirements.txt issue --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f2bff2680b..55c5f14568 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,6 @@ uvicorn==0.22.0 # server dep gunicorn==21.2.0 # server dep boto3==1.28.58 # aws bedrock/sagemaker calls redis==5.0.0 # caching -redisvl==0.0.7 # semantic caching numpy==1.24.3 # semantic caching prisma==0.11.0 # for db mangum==0.17.0 # for aws lambda functions From 66b32c2df65a4628af8ace967df3c5f0a30b5f19 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 13:55:51 -0800 Subject: [PATCH 206/218] refactor(main.py): trigger deploy n --- litellm/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/litellm/main.py b/litellm/main.py index 384dadc32d..b18221607f 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -10,7 +10,6 @@ import os, openai, sys, json, inspect, uuid, datetime, threading from typing import Any, Literal, Union from functools import partial - import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy import httpx From 4de77018cc79e871f409b66fb5d624af4ce1d241 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 15:19:36 -0800 Subject: [PATCH 207/218] (feat) support max_user_budget --- litellm/proxy/proxy_server.py | 31 +++++++++++++++++++++++++------ litellm/proxy/utils.py | 22 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 427bb88a9c..e97ae734fe 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -91,6 +91,7 @@ from litellm.proxy.utils import ( reset_budget, hash_token, html_form, + _read_request_body, ) from litellm.proxy.secret_managers.google_kms import load_google_kms import pydantic @@ -370,8 +371,9 @@ async def user_api_key_auth( # Run checks for # 1. If token can call model # 2. If user_id for this token is in budget - # 3. If token is expired - # 4. If token spend is under Budget for the token + # 3. If 'user' passed to /chat/completions, /embeddings endpoint is in budget + # 4. If token is expired + # 5. If token spend is under Budget for the token # Check 1. If token can call model litellm.model_alias_map = valid_token.aliases @@ -430,11 +432,24 @@ async def user_api_key_auth( ) # Check 2. 
If user_id for this token is in budget - ## Check 2.5 If global proxy is in budget + ## Check 2.1 If global proxy is in budget + ## Check 2.2 [OPTIONAL - checked only if litellm.max_user_budget is not None] If 'user' passed in /chat/completions is in budget if valid_token.user_id is not None: + user_id_list = [ + valid_token.user_id, + litellm_proxy_budget_name, + ] + if ( + litellm.max_user_budget is not None + ): # Check if 'user' passed in /chat/completions is in budget, only checked if litellm.max_user_budget is set + request_data = await _read_request_body(request=request) + user_passed_to_chat_completions = request_data.get("user", None) + if user_passed_to_chat_completions is not None: + user_id_list.append(user_passed_to_chat_completions) + if prisma_client is not None: user_id_information = await prisma_client.get_data( - user_id_list=[valid_token.user_id, litellm_proxy_budget_name], + user_id_list=user_id_list, table_name="user", query_type="find_all", ) @@ -459,7 +474,7 @@ async def user_api_key_auth( user_current_spend = _user.get("spend", None) verbose_proxy_logger.debug( - f"user_max_budget: {user_max_budget}; user_current_spend: {user_current_spend}" + f"user_id: {_user.get('user_id', None)}; user_max_budget: {user_max_budget}; user_current_spend: {user_current_spend}" ) if ( @@ -852,9 +867,13 @@ async def update_database( f"Updating existing_spend_obj: {existing_spend_obj}" ) if existing_spend_obj is None: + # if user does not exist in LiteLLM_UserTable, create a new user existing_spend = 0 + max_user_budget = None + if litellm.max_user_budget is not None: + max_user_budget = litellm.max_user_budget existing_spend_obj = LiteLLM_UserTable( - user_id=id, spend=0, max_budget=None, user_email=None + user_id=id, spend=0, max_budget=max_user_budget, user_email=None ) else: existing_spend = existing_spend_obj.spend diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 84b09d7265..b28f887eff 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -1213,6 +1213,28 @@ async def reset_budget(prisma_client: PrismaClient): ) +async def _read_request_body(request): + """ + Asynchronous function to read the request body and parse it as JSON or literal data. 
+ + Parameters: + - request: The request object to read the body from + + Returns: + - dict: Parsed request data as a dictionary + """ + import ast, json + + request_data = {} + body = await request.body() + body_str = body.decode() + try: + request_data = ast.literal_eval(body_str) + except: + request_data = json.loads(body_str) + return request_data + + # LiteLLM Admin UI - Non SSO Login html_form = """ From 3dc85a526370b9146908a77d048151f6645de462 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 15:25:51 -0800 Subject: [PATCH 208/218] (test) track_cost_ for end users --- litellm/tests/test_key_generate_prisma.py | 81 +++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py index b4c86afb25..d4f405b7b0 100644 --- a/litellm/tests/test_key_generate_prisma.py +++ b/litellm/tests/test_key_generate_prisma.py @@ -322,6 +322,87 @@ def test_call_with_user_over_budget(prisma_client): print(vars(e)) +def test_call_with_end_user_over_budget(prisma_client): + # Test if a user passed to /chat/completions is tracked & fails whe they cross their budget + # we only check this when litellm.max_user_budget is set + import random + + setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) + setattr(litellm.proxy.proxy_server, "master_key", "sk-1234") + setattr(litellm, "max_user_budget", 0.00001) + try: + + async def test(): + await litellm.proxy.proxy_server.prisma_client.connect() + request = GenerateKeyRequest() # create a key with no budget + key = await new_user(request) + print(key) + + generated_key = key.key + bearer_token = "Bearer " + generated_key + user = f"ishaan {random.randint(0, 10000)}" + request = Request(scope={"type": "http"}) + request._url = URL(url="/chat/completions") + + async def return_body(): + return_string = f'{{"model": "gemini-pro-vision", "user": "{user}"}}' + # return string as bytes + return return_string.encode() + + request.body = return_body + + # update spend using track_cost callback, make 2nd request, it should fail + from litellm.proxy.proxy_server import ( + _PROXY_track_cost_callback as track_cost_callback, + ) + from litellm import ModelResponse, Choices, Message, Usage + + resp = ModelResponse( + id="chatcmpl-e41836bb-bb8b-4df2-8e70-8f3e160155ac", + choices=[ + Choices( + finish_reason=None, + index=0, + message=Message( + content=" Sure! Here is a short poem about the sky:\n\nA canvas of blue, a", + role="assistant", + ), + ) + ], + model="gpt-35-turbo", # azure always has model written like this + usage=Usage(prompt_tokens=210, completion_tokens=200, total_tokens=410), + ) + await track_cost_callback( + kwargs={ + "stream": False, + "litellm_params": { + "metadata": { + "user_api_key": generated_key, + "user_api_key_user_id": user, + }, + "proxy_server_request": { + "user": user, + }, + }, + "response_cost": 10, + }, + completion_response=resp, + start_time=datetime.now(), + end_time=datetime.now(), + ) + await asyncio.sleep(5) + # use generated key to auth in + result = await user_api_key_auth(request=request, api_key=bearer_token) + print("result from user auth with new key", result) + pytest.fail(f"This should have failed!. 
They key crossed it's budget") + + asyncio.run(test()) + except Exception as e: + error_detail = e.message + assert "Authentication Error, ExceededBudget:" in error_detail + print(vars(e)) + + def test_call_with_proxy_over_budget(prisma_client): # 5.1 Make a call with a proxy over budget, expect to fail setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client) From cfd46738ed9e531217f1dac664b0bc4b26543281 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 15:39:45 -0800 Subject: [PATCH 209/218] (docs) budget per end_user --- docs/my-website/docs/proxy/users.md | 56 +++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/docs/my-website/docs/proxy/users.md b/docs/my-website/docs/proxy/users.md index c5f2ca358c..baca0188e8 100644 --- a/docs/my-website/docs/proxy/users.md +++ b/docs/my-website/docs/proxy/users.md @@ -13,6 +13,7 @@ Requirements: You can set budgets at 3 levels: - For the proxy - For a user +- For a 'user' passed to `/chat/completions`, `/embeddings` etc - For a key @@ -117,6 +118,61 @@ curl --location 'http://0.0.0.0:8000/key/generate' \ --data '{"models": ["azure-models"], "user_id": "krrish3@berri.ai"}' ``` + + + +Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user** + +**Step 1. Modify config.yaml** +Define `litellm.max_user_budget` +```yaml +general_settings: + master_key: sk-1234 + +litellm_settings: + max_budget: 10 # global budget for proxy + max_user_budget: 0.0001 # budget for 'user' passed to /chat/completions +``` + +2. Make a /chat/completions call, pass 'user' - First call Works +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \ + --data ' { + "model": "azure-gpt-3.5", + "user": "ishaan3", + "messages": [ + { + "role": "user", + "content": "what time is it" + } + ] + }' +``` + +3. Make a /chat/completions call, pass 'user' - Call Fails, since 'ishaan3' over budget +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer sk-zi5onDRdHGD24v0Zdn7VBA' \ + --data ' { + "model": "azure-gpt-3.5", + "user": "ishaan3", + "messages": [ + { + "role": "user", + "content": "what time is it" + } + ] + }' +``` + +Error +```shell +{"error":{"message":"Authentication Error, ExceededBudget: User ishaan3 has exceeded their budget. 
Current spend: 0.0008869999999999999; Max Budget: 0.0001","type":"auth_error","param":"None","code":401}}% +``` + From 3017377740bb46c35f83a3048e98e13de382b8e8 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 16:08:25 -0800 Subject: [PATCH 210/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 80a4372a57..6eac2ebf5f 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -1743,7 +1743,7 @@ def test_azure_cloudflare_api(): def test_completion_anyscale_2(): try: - # litellm.set_verbose=True + # litellm.set_verbose= True messages = [ {"role": "system", "content": "You're a good bot"}, { From 0908798a2cefba192598b22da08855d7b1d354e5 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 17:12:46 -0800 Subject: [PATCH 211/218] =?UTF-8?q?bump:=20version=201.22.9=20=E2=86=92=20?= =?UTF-8?q?1.22.10?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 944aad7f8b..1961ccbbea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.9" +version = "1.22.10" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.9" +version = "1.22.10" version_files = [ "pyproject.toml:^version" ] From fc50abe6d157ee581e69ad82a534965d133ba936 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 18:25:15 -0800 Subject: [PATCH 212/218] (ci/cd) run again --- litellm/tests/test_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 6eac2ebf5f..80a4372a57 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -1743,7 +1743,7 @@ def test_azure_cloudflare_api(): def test_completion_anyscale_2(): try: - # litellm.set_verbose= True + # litellm.set_verbose=True messages = [ {"role": "system", "content": "You're a good bot"}, { From 756f902b6f66d3ccb12289c8dec2a452427e385f Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 19:23:27 -0800 Subject: [PATCH 213/218] (fix) dockerfile for semantic caching --- Dockerfile | 3 +++ Dockerfile.database | 3 +++ 2 files changed, 6 insertions(+) diff --git a/Dockerfile b/Dockerfile index 0de52b8e7e..0c3a2a33c0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,6 +32,9 @@ RUN pip install dist/*.whl # install dependencies as wheels RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt +# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0 +RUN pip install redisvl==0.0.7 --no-deps + # Runtime stage FROM $LITELLM_RUNTIME_IMAGE as runtime diff --git a/Dockerfile.database b/Dockerfile.database index 9e9fb7fc44..8ceafc718d 100644 --- a/Dockerfile.database +++ b/Dockerfile.database @@ -47,6 +47,9 @@ COPY --from=builder /wheels/ /wheels/ # Install the built wheel using pip; again using a wildcard if it's the only file RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels +# install semantic-cache [Experimental]- we need this 
here and not in requirements.txt because redisvl pins to pydantic 1.0 +RUN pip install redisvl==0.0.7 --no-deps + # Generate prisma client RUN prisma generate RUN chmod +x entrypoint.sh From 28f100c53e00e01836db4a9e856c2f1f7d917041 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 19:23:57 -0800 Subject: [PATCH 214/218] =?UTF-8?q?bump:=20version=201.22.10=20=E2=86=92?= =?UTF-8?q?=201.22.11?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1961ccbbea..fdcdede2a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.22.10" +version = "1.22.11" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.22.10" +version = "1.22.11" version_files = [ "pyproject.toml:^version" ] From 80f24fc29c1c66a3cf17dfb06c9c45212f51be47 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 19:29:39 -0800 Subject: [PATCH 215/218] (fix) allow litellm_settings to be None --- litellm/proxy/proxy_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 7605ad79a3..d34bdd8c9d 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1124,7 +1124,7 @@ class ProxyConfig: # load existing config config = await self.get_config() ## LITELLM MODULE SETTINGS (e.g. litellm.drop_params=True,..) - litellm_settings = config.get("litellm_settings", None) + litellm_settings = config.get("litellm_settings", {}) all_teams_config = litellm_settings.get("default_team_settings", None) team_config: dict = {} if all_teams_config is None: From 46153724d27e03e475d5a28b55453c7786dfb968 Mon Sep 17 00:00:00 2001 From: ishaan-jaff Date: Tue, 6 Feb 2024 19:39:32 -0800 Subject: [PATCH 216/218] (docs) caching --- docs/my-website/docs/proxy/caching.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 2b385de8e5..50aba03db5 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -17,7 +17,7 @@ LiteLLM supports: Caching can be enabled by adding the `cache` key in the `config.yaml` -### Step 1: Add `cache` to the config.yaml +#### Step 1: Add `cache` to the config.yaml ```yaml model_list: - model_name: gpt-3.5-turbo @@ -32,7 +32,7 @@ litellm_settings: cache: True # set cache responses to True, litellm defaults to using a redis cache ``` -### Step 2: Add Redis Credentials to .env +#### Step 2: Add Redis Credentials to .env Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. 
```shell @@ -50,7 +50,7 @@ REDIS_ = "" ``` [**See how it's read from the environment**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/_redis.py#L40) -### Step 3: Run proxy with config +#### Step 3: Run proxy with config ```shell $ litellm --config /path/to/config.yaml ``` @@ -58,7 +58,7 @@ $ litellm --config /path/to/config.yaml -### Step 1: Add `cache` to the config.yaml +#### Step 1: Add `cache` to the config.yaml ```yaml model_list: - model_name: gpt-3.5-turbo @@ -80,7 +80,7 @@ litellm_settings: s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets ``` -### Step 2: Run proxy with config +#### Step 2: Run proxy with config ```shell $ litellm --config /path/to/config.yaml ``` @@ -91,7 +91,7 @@ $ litellm --config /path/to/config.yaml Caching can be enabled by adding the `cache` key in the `config.yaml` -### Step 1: Add `cache` to the config.yaml +#### Step 1: Add `cache` to the config.yaml ```yaml model_list: - model_name: gpt-3.5-turbo @@ -113,7 +113,7 @@ litellm_settings: redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list ``` -### Step 2: Add Redis Credentials to .env +#### Step 2: Add Redis Credentials to .env Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. ```shell @@ -130,7 +130,7 @@ You can pass in any additional redis.Redis arg, by storing the variable + value REDIS_ = "" ``` -### Step 3: Run proxy with config +#### Step 3: Run proxy with config ```shell $ litellm --config /path/to/config.yaml ``` From c43baff5c9640826026a0122ecbdf1931e527a40 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 20:10:19 -0800 Subject: [PATCH 217/218] fix: export npm build into proxy --- litellm/proxy/_experimental/out/404.html | 2 +- ...ayout-4d667c133e03c98b.js => layout-ea657eeec2abf062.js} | 2 +- .../out/_next/static/chunks/app/page-7f03ccc8529ada97.js | 1 + .../out/_next/static/chunks/app/page-992f4cdd1053ee86.js | 1 - .../out/_next/static/chunks/main-app-096338c8e1915716.js | 2 +- .../out/_next/static/chunks/main-app-9b4fb13a7db53edf.js | 1 - .../_buildManifest.js | 0 .../_ssgManifest.js | 0 litellm/proxy/_experimental/out/index.html | 2 +- litellm/proxy/_experimental/out/index.txt | 6 +++--- litellm/proxy/proxy_server.py | 6 +++++- ui/litellm-dashboard/out/404.html | 2 +- .../out/_next/static/chunks/app/layout-4d667c133e03c98b.js | 1 - .../out/_next/static/chunks/app/page-992f4cdd1053ee86.js | 1 - .../_next/static/lGjwnJSGwBqa476jHHI8W/_buildManifest.js | 1 - .../out/_next/static/lGjwnJSGwBqa476jHHI8W/_ssgManifest.js | 1 - ui/litellm-dashboard/out/index.html | 2 +- ui/litellm-dashboard/out/index.txt | 6 +++--- ui/litellm-dashboard/src/components/networking.tsx | 2 +- 19 files changed, 19 insertions(+), 20 deletions(-) rename litellm/proxy/_experimental/out/_next/static/chunks/app/{layout-4d667c133e03c98b.js => layout-ea657eeec2abf062.js} (60%) create mode 100644 litellm/proxy/_experimental/out/_next/static/chunks/app/page-7f03ccc8529ada97.js delete mode 100644 litellm/proxy/_experimental/out/_next/static/chunks/app/page-992f4cdd1053ee86.js rename ui/litellm-dashboard/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js => litellm/proxy/_experimental/out/_next/static/chunks/main-app-096338c8e1915716.js (54%) delete mode 100644 litellm/proxy/_experimental/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js rename 
litellm/proxy/_experimental/out/_next/static/{lGjwnJSGwBqa476jHHI8W => p5gDwQBbgW8D3Uz3lgoZg}/_buildManifest.js (100%) rename litellm/proxy/_experimental/out/_next/static/{lGjwnJSGwBqa476jHHI8W => p5gDwQBbgW8D3Uz3lgoZg}/_ssgManifest.js (100%) delete mode 100644 ui/litellm-dashboard/out/_next/static/chunks/app/layout-4d667c133e03c98b.js delete mode 100644 ui/litellm-dashboard/out/_next/static/chunks/app/page-992f4cdd1053ee86.js delete mode 100644 ui/litellm-dashboard/out/_next/static/lGjwnJSGwBqa476jHHI8W/_buildManifest.js delete mode 100644 ui/litellm-dashboard/out/_next/static/lGjwnJSGwBqa476jHHI8W/_ssgManifest.js diff --git a/litellm/proxy/_experimental/out/404.html b/litellm/proxy/_experimental/out/404.html index 1ec6cd9a4f..c57eb5193a 100644 --- a/litellm/proxy/_experimental/out/404.html +++ b/litellm/proxy/_experimental/out/404.html @@ -1 +1 @@ -404: This page could not be found.Create Next App

404 This page could not be found.
\ No newline at end of file +404: This page could not be found.🚅 LiteLLM 404 This page could not be found.
\ No newline at end of file diff --git a/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-4d667c133e03c98b.js b/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-ea657eeec2abf062.js similarity index 60% rename from litellm/proxy/_experimental/out/_next/static/chunks/app/layout-4d667c133e03c98b.js rename to litellm/proxy/_experimental/out/_next/static/chunks/app/layout-ea657eeec2abf062.js index e261adc052..fe5260febf 100644 --- a/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-4d667c133e03c98b.js +++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-ea657eeec2abf062.js @@ -1 +1 @@ -(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]); \ No newline at end of file +(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{11837:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=11837)}),_N_E=n.O()}]); \ No newline at end of file diff --git a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-7f03ccc8529ada97.js b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-7f03ccc8529ada97.js new file mode 100644 index 0000000000..c8c53fcee1 --- /dev/null +++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-7f03ccc8529ada97.js @@ -0,0 +1 @@ +(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[931],{27376:function(e,t,l){Promise.resolve().then(l.bind(l,27680))},27680:function(e,t,l){"use strict";l.r(t),l.d(t,{default:function(){return B}});var s=l(3827),r=l(64090),a=l(80588);let n=async(e,t,l)=>{try{if(console.log("Form Values in keyCreateCall:",l),l.description&&(l.metadata||(l.metadata={}),l.metadata.description=l.description,delete l.description,l.metadata=JSON.stringify(l.metadata)),l.metadata){console.log("formValues.metadata:",l.metadata);try{l.metadata=JSON.parse(l.metadata)}catch(e){throw a.ZP.error("Failed to parse metadata: "+e),Error("Failed to parse metadata: "+e)}}console.log("Form Values after check:",l);let s=await fetch("/key/generate",{method:"POST",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"},body:JSON.stringify({user_id:t,...l})});if(!s.ok){let e=await s.text();throw a.ZP.error("Failed to create key: "+e),console.error("Error response from the server:",e),Error("Network response was not ok")}let r=await s.json();return console.log("API Response:",r),r}catch(e){throw console.error("Failed to create key:",e),e}},o=async(e,t)=>{try{console.log("in keyDeleteCall:",t),a.ZP.info("Making key delete request");let l=await fetch("/key/delete",{method:"POST",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"},body:JSON.stringify({keys:[t]})});if(!l.ok){let e=await l.text();throw a.ZP.error("Failed to delete key: "+e),Error("Network response was not ok")}let s=await l.json();return console.log(s),a.ZP.success("API Key Deleted"),s}catch(e){throw console.error("Failed to create 
key:",e),e}},i=async(e,t,l)=>{try{let s="/user/info";"App Owner"==l&&(s="".concat(s,"/?user_id=").concat(t)),a.ZP.info("Requesting user data");let r=await fetch(s,{method:"GET",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"}});if(!r.ok){let e=await r.text();throw a.ZP.error(e),Error("Network response was not ok")}let n=await r.json();return a.ZP.info("Received user data"),n}catch(e){throw console.error("Failed to create key:",e),e}},c=async(e,t)=>{try{let l="/spend/logs";console.log("in keySpendLogsCall:",l);let s=await fetch("".concat(l,"/?api_key=").concat(t),{method:"GET",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"}});if(!s.ok){let e=await s.text();throw a.ZP.error(e),Error("Network response was not ok")}let r=await s.json();return console.log(r),r}catch(e){throw console.error("Failed to create key:",e),e}};var d=l(10384),h=l(46453),u=l(2179),m=l(71801),x=l(96776),j=l(2902),p=l(77171),y=l(29714),Z=l(88707),g=l(1861);let{Option:f}=x.default;var k=e=>{let{userID:t,userRole:l,accessToken:o,data:i,setData:c}=e,[x]=j.Z.useForm(),[f,k]=(0,r.useState)(!1),[w,b]=(0,r.useState)(null),S=()=>{k(!1),x.resetFields()},N=()=>{k(!1),b(null),x.resetFields()},_=async e=>{try{a.ZP.info("Making API Call"),e.models&&""!==e.models.trim()?e.models=e.models.split(",").map(e=>e.trim()):e.models=[],k(!0);let l=await n(o,t,e);c(e=>e?[...e,l]:[l]),b(l.key),a.ZP.success("API Key Created"),x.resetFields()}catch(e){console.error("Error creating the key:",e)}};return(0,s.jsxs)("div",{children:[(0,s.jsx)(u.Z,{className:"mx-auto",onClick:()=>k(!0),children:"+ Create New Key"}),(0,s.jsx)(p.Z,{title:"Create Key",visible:f,width:800,footer:null,onOk:S,onCancel:N,children:(0,s.jsxs)(j.Z,{form:x,onFinish:_,labelCol:{span:6},wrapperCol:{span:16},labelAlign:"left",children:["App Owner"===l||"Admin"===l?(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(j.Z.Item,{label:"Key Name",name:"key_alias",children:(0,s.jsx)(y.Z,{})}),(0,s.jsx)(j.Z.Item,{label:"Team ID",name:"team_id",children:(0,s.jsx)(y.Z,{placeholder:"ai_team"})}),(0,s.jsx)(j.Z.Item,{label:"Models (Comma Separated). Eg: gpt-3.5-turbo,gpt-4",name:"models",children:(0,s.jsx)(y.Z,{placeholder:"gpt-4,gpt-3.5-turbo"})}),(0,s.jsx)(j.Z.Item,{label:"Max Budget (USD)",name:"max_budget",children:(0,s.jsx)(Z.Z,{step:.01,precision:2,width:200})}),(0,s.jsx)(j.Z.Item,{label:"Duration (eg: 30s, 30h, 30d)",name:"duration",children:(0,s.jsx)(y.Z,{})}),(0,s.jsx)(j.Z.Item,{label:"Metadata",name:"metadata",children:(0,s.jsx)(y.Z.TextArea,{rows:4,placeholder:"Enter metadata as JSON"})})]}):(0,s.jsxs)(s.Fragment,{children:[(0,s.jsx)(j.Z.Item,{label:"Key Name",name:"key_alias",children:(0,s.jsx)(y.Z,{})}),(0,s.jsx)(j.Z.Item,{label:"Team ID (Contact Group)",name:"team_id",children:(0,s.jsx)(y.Z,{placeholder:"ai_team"})}),(0,s.jsx)(j.Z.Item,{label:"Description",name:"description",children:(0,s.jsx)(y.Z.TextArea,{placeholder:"Enter description",rows:4})})]}),(0,s.jsx)("div",{style:{textAlign:"right",marginTop:"10px"},children:(0,s.jsx)(g.ZP,{htmlType:"submit",children:"Create Key"})})]})}),w&&(0,s.jsx)(p.Z,{title:"Save your key",visible:f,onOk:S,onCancel:N,footer:null,children:(0,s.jsxs)(h.Z,{numItems:1,className:"gap-2 w-full",children:[(0,s.jsx)(d.Z,{numColSpan:1,children:(0,s.jsxs)("p",{children:["Please save this secret key somewhere safe and accessible. For security reasons, ",(0,s.jsx)("b",{children:"you will not be able to view it again"})," ","through your LiteLLM account. 
If you lose this secret key, you will need to generate a new one."]})}),(0,s.jsx)(d.Z,{numColSpan:1,children:null!=w?(0,s.jsxs)(m.Z,{children:["API Key: ",w]}):(0,s.jsx)(m.Z,{children:"Key being created, this might take 30s"})})]})})]})},w=l(33393),b=l(13810),S=l(61244),N=l(10827),_=l(3851),D=l(2044),v=l(64167),C=l(74480),I=l(7178),E=l(42440),T=l(9853),A=l(67989),O=l(56863),P=e=>{let{token:t,accessToken:l,keySpend:a,keyBudget:n,keyName:o}=e,[i,d]=(0,r.useState)(!1),[h,m]=(0,r.useState)(null),[x,j]=(0,r.useState)(null),y=async()=>{try{if(null==l||null==t)return;console.log("accessToken: ".concat(l,"; token: ").concat(t));let e=await c(l,t);console.log("Response:",e);let s=Object.values(e).reduce((e,t)=>{let l=new Date(t.startTime),s=new Intl.DateTimeFormat("en-US",{day:"2-digit",month:"short"}).format(l);return e[s]=(e[s]||0)+t.spend,e},{}),r=Object.entries(s);r.sort((e,t)=>{let[l]=e,[s]=t,r=new Date(l),a=new Date(s);return r.getTime()-a.getTime()});let a=Object.fromEntries(r);console.log(a);let n=Object.values(e).reduce((e,t)=>{let l=t.user;return e[l]=(e[l]||0)+t.spend,e},{});console.log(s),console.log(n);let o=[];for(let[e,t]of Object.entries(a))o.push({day:e,spend:t});let i=Object.entries(n).sort((e,t)=>t[1]-e[1]).slice(0,5).map(e=>{let[t,l]=e;return{name:t,value:l}});m(o),j(i),console.log("arrayBarChart:",o)}catch(e){console.error("There was an error fetching the data",e)}};return t?(0,s.jsxs)("div",{children:[(0,s.jsx)(u.Z,{className:"mx-auto",onClick:()=>{console.log("Show Modal triggered"),d(!0),y()},children:"View Spend Report"}),(0,s.jsxs)(p.Z,{visible:i,width:1e3,onOk:()=>{d(!1)},onCancel:()=>{d(!1)},footer:null,children:[(0,s.jsxs)(E.Z,{style:{textAlign:"left"},children:["Key Name: ",o]}),(0,s.jsxs)(O.Z,{children:["Monthly Spend $",a]}),(0,s.jsx)(b.Z,{className:"mt-6 mb-6",children:h&&(0,s.jsx)(T.Z,{className:"mt-6",data:h,colors:["green"],index:"day",categories:["spend"],yAxisWidth:48})}),(0,s.jsx)(E.Z,{className:"mt-6",children:"Top 5 Users Spend (USD)"}),(0,s.jsx)(b.Z,{className:"mb-6",children:x&&(0,s.jsx)(A.Z,{className:"mt-6",data:x,color:"teal"})})]})]}):null},F=e=>{let{userID:t,accessToken:l,data:a,setData:n}=e,[i,c]=(0,r.useState)(!1),d=async e=>{if(null!=a)try{await o(l,e);let t=a.filter(t=>t.token!==e);n(t)}catch(e){console.error("Error deleting the key:",e)}};if(null!=a)return console.log("RERENDER TRIGGERED"),(0,s.jsxs)(b.Z,{className:"w-full mx-auto flex-auto overflow-y-auto max-h-[50vh] mb-4",children:[(0,s.jsx)(E.Z,{children:"API Keys"}),(0,s.jsxs)(N.Z,{className:"mt-5",children:[(0,s.jsx)(v.Z,{children:(0,s.jsxs)(I.Z,{children:[(0,s.jsx)(C.Z,{children:"Key Alias"}),(0,s.jsx)(C.Z,{children:"Secret Key"}),(0,s.jsx)(C.Z,{children:"Spend (USD)"}),(0,s.jsx)(C.Z,{children:"Key Budget (USD)"}),(0,s.jsx)(C.Z,{children:"Team ID"}),(0,s.jsx)(C.Z,{children:"Metadata"}),(0,s.jsx)(C.Z,{children:"Expires"})]})}),(0,s.jsx)(_.Z,{children:a.map(e=>(console.log(e),"litellm-dashboard"===e.team_id)?null:(0,s.jsxs)(I.Z,{children:[(0,s.jsx)(D.Z,{children:null!=e.key_alias?(0,s.jsx)(m.Z,{children:e.key_alias}):(0,s.jsx)(m.Z,{children:"Not Set"})}),(0,s.jsx)(D.Z,{children:(0,s.jsx)(m.Z,{children:e.key_name})}),(0,s.jsx)(D.Z,{children:(0,s.jsx)(m.Z,{children:e.spend})}),(0,s.jsx)(D.Z,{children:null!=e.max_budget?(0,s.jsx)(m.Z,{children:e.max_budget}):(0,s.jsx)(m.Z,{children:"Unlimited 
Budget"})}),(0,s.jsx)(D.Z,{children:(0,s.jsx)(m.Z,{children:e.team_id})}),(0,s.jsx)(D.Z,{children:(0,s.jsx)(m.Z,{children:JSON.stringify(e.metadata)})}),(0,s.jsx)(D.Z,{children:null!=e.expires?(0,s.jsx)(m.Z,{children:e.expires}):(0,s.jsx)(m.Z,{children:"Never expires"})}),(0,s.jsx)(D.Z,{children:(0,s.jsx)(S.Z,{onClick:()=>d(e.token),icon:w.Z,size:"sm"})}),(0,s.jsx)(D.Z,{children:(0,s.jsx)(P,{token:e.token,accessToken:l,keySpend:e.spend,keyBudget:e.max_budget,keyName:e.key_name})})]},e.token))})]})]})},R=e=>{let{userID:t,userSpendData:l}=e;console.log("User SpendData:",l);let r=null==l?void 0:l.spend,a=(null==l?void 0:l.max_budget)||null,n=null!==a?"$".concat(a," limit"):"No limit";return"$".concat(r," / ").concat(n),(0,s.jsx)(s.Fragment,{children:(0,s.jsxs)(b.Z,{className:"mx-auto mb-4",children:[(0,s.jsxs)(O.Z,{children:["$",r]}),(0,s.jsxs)(E.Z,{children:["/ ",n]})]})})},U=l(8792),K=e=>{let{userID:t,userRole:l,userEmail:r}=e;return console.log("User ID:",t),console.log("userEmail:",r),(0,s.jsxs)("nav",{className:"left-0 right-0 top-0 flex justify-between items-center h-12 mb-4",children:[(0,s.jsx)("div",{className:"text-left mx-4 my-2 absolute top-0 left-0",children:(0,s.jsx)("div",{className:"flex flex-col items-center",children:(0,s.jsx)(U.default,{href:"/",children:(0,s.jsx)("button",{className:"text-gray-800 text-2xl px-4 py-1 rounded text-center",children:"\uD83D\uDE85 LiteLLM"})})})}),(0,s.jsx)("div",{className:"text-right mx-4 my-2 absolute top-0 right-0",children:(0,s.jsxs)(u.Z,{variant:"secondary",children:[r,(0,s.jsxs)("p",{children:["Role: ",l]}),(0,s.jsxs)("p",{children:["ID: ",t]})]})})]})},L=l(47907),M=l(37963);console.log("isLocal:",!1);var B=()=>{let[e,t]=(0,r.useState)(null),[l,a]=(0,r.useState)(null),n=(0,L.useSearchParams)(),o=n.get("userID");n.get("viewSpend");let c=n.get("token"),[u,m]=(0,r.useState)(null),[x,j]=(0,r.useState)(null),[p,y]=(0,r.useState)(null);if((0,r.useEffect)(()=>{if(c){let e=(0,M.o)(c);if(e){if(console.log("Decoded token:",e),console.log("Decoded key:",e.key),m(e.key),e.user_role){let t=function(e){if(!e)return"Undefined Role";switch(console.log("Received user role: ".concat(e)),e.toLowerCase()){case"app_owner":case"demo_app_owner":return"App Owner";case"app_admin":return"Admin";case"app_user":return"App User";default:return"Unknown Role"}}(e.user_role);console.log("Decoded user_role:",t),j(t)}else console.log("User role not defined");e.user_email?y(e.user_email):console.log("User Email is not set ".concat(e))}}o&&u&&x&&!e&&(async()=>{try{let e=await i(u,o,x);a(e.user_info),t(e.keys)}catch(e){console.error("There was an error fetching the data",e)}})()},[o,c,u,e]),null==o||null==c){let e="/sso/key/generate";return console.log("Full URL:",e),window.location.href=e,null}return null==u?null:(null==x&&j("App Owner"),(0,s.jsxs)("div",{children:[(0,s.jsx)(K,{userID:o,userRole:x,userEmail:p}),(0,s.jsx)(h.Z,{numItems:1,className:"gap-0 p-10 h-[75vh] w-full",children:(0,s.jsxs)(d.Z,{numColSpan:1,children:[(0,s.jsx)(R,{userID:o,userSpendData:l}),(0,s.jsx)(F,{userID:o,accessToken:u,data:e,setData:t}),(0,s.jsx)(k,{userID:o,userRole:x,accessToken:u,data:e,setData:t})]})})]}))}}},function(e){e.O(0,[787,971,69,744],function(){return e(e.s=27376)}),_N_E=e.O()}]); \ No newline at end of file diff --git a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-992f4cdd1053ee86.js b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-992f4cdd1053ee86.js deleted file mode 100644 index cd4ccb43cb..0000000000 --- 
a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-992f4cdd1053ee86.js +++ /dev/null @@ -1 +0,0 @@ -(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[931],{88102:function(e,t,l){Promise.resolve().then(l.bind(l,27680))},27680:function(e,t,l){"use strict";l.r(t),l.d(t,{default:function(){return M}});var r=l(3827),s=l(64090),n=l(80588);let a=async(e,t,l)=>{try{if(console.log("Form Values in keyCreateCall:",l),l.description&&(l.metadata||(l.metadata={}),l.metadata.description=l.description,delete l.description,l.metadata=JSON.stringify(l.metadata)),l.metadata){console.log("formValues.metadata:",l.metadata);try{l.metadata=JSON.parse(l.metadata)}catch(e){throw n.ZP.error("Failed to parse metadata: "+e),Error("Failed to parse metadata: "+e)}}console.log("Form Values after check:",l);let r=await fetch("/key/generate",{method:"POST",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"},body:JSON.stringify({user_id:t,...l})});if(!r.ok){let e=await r.text();throw n.ZP.error("Failed to create key: "+e),console.error("Error response from the server:",e),Error("Network response was not ok")}let s=await r.json();return console.log("API Response:",s),s}catch(e){throw console.error("Failed to create key:",e),e}},o=async(e,t)=>{try{console.log("in keyDeleteCall:",t);let l=await fetch("/key/delete",{method:"POST",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"},body:JSON.stringify({keys:[t]})});if(!l.ok){let e=await l.text();throw n.ZP.error("Failed to delete key: "+e),Error("Network response was not ok")}let r=await l.json();return console.log(r),n.ZP.success("API Key Deleted"),r}catch(e){throw console.error("Failed to create key:",e),e}},i=async(e,t)=>{try{let l="/user/info";console.log("in userInfoCall:",l);let r=await fetch("".concat(l,"/?user_id=").concat(t),{method:"GET",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"}});if(!r.ok){let e=await r.text();throw n.ZP.error(e),Error("Network response was not ok")}let s=await r.json();return console.log(s),s}catch(e){throw console.error("Failed to create key:",e),e}},c=async(e,t)=>{try{let l="/spend/logs";console.log("in keySpendLogsCall:",l);let r=await fetch("".concat(l,"/?api_key=").concat(t),{method:"GET",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"}});if(!r.ok){let e=await r.text();throw n.ZP.error(e),Error("Network response was not ok")}let s=await r.json();return console.log(s),s}catch(e){throw console.error("Failed to create key:",e),e}};var d=l(10384),h=l(46453),u=l(2179),m=l(71801),x=l(96776),j=l(2902),p=l(77171),y=l(29714),Z=l(88707),g=l(1861);let{Option:f}=x.default;var w=e=>{let{userID:t,userRole:l,accessToken:o,data:i,setData:c}=e,[x]=j.Z.useForm(),[f,w]=(0,s.useState)(!1),[k,b]=(0,s.useState)(null),N=()=>{w(!1),x.resetFields()},S=()=>{w(!1),b(null),x.resetFields()},_=async e=>{try{n.ZP.info("Making API Call"),e.models&&""!==e.models.trim()?e.models=e.models.split(",").map(e=>e.trim()):e.models=[],w(!0);let l=await a(o,t,e);c(e=>e?[...e,l]:[l]),b(l.key),n.ZP.success("API Key Created"),x.resetFields()}catch(e){console.error("Error creating the key:",e)}};return(0,r.jsxs)("div",{children:[(0,r.jsx)(u.Z,{className:"mx-auto",onClick:()=>w(!0),children:"+ Create New Key"}),(0,r.jsx)(p.Z,{title:"Create Key",visible:f,width:800,footer:null,onOk:N,onCancel:S,children:(0,r.jsxs)(j.Z,{form:x,onFinish:_,labelCol:{span:6},wrapperCol:{span:16},labelAlign:"left",children:["App 
Owner"===l||"Admin"===l?(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(j.Z.Item,{label:"Key Name",name:"key_alias",children:(0,r.jsx)(y.Z,{})}),(0,r.jsx)(j.Z.Item,{label:"Team ID",name:"team_id",children:(0,r.jsx)(y.Z,{placeholder:"ai_team"})}),(0,r.jsx)(j.Z.Item,{label:"Models (Comma Separated). Eg: gpt-3.5-turbo,gpt-4",name:"models",children:(0,r.jsx)(y.Z,{placeholder:"gpt-4,gpt-3.5-turbo"})}),(0,r.jsx)(j.Z.Item,{label:"Max Budget (USD)",name:"max_budget",children:(0,r.jsx)(Z.Z,{step:.01,precision:2,width:200})}),(0,r.jsx)(j.Z.Item,{label:"Duration (eg: 30s, 30h, 30d)",name:"duration",children:(0,r.jsx)(y.Z,{})}),(0,r.jsx)(j.Z.Item,{label:"Metadata",name:"metadata",children:(0,r.jsx)(y.Z.TextArea,{rows:4,placeholder:"Enter metadata as JSON"})})]}):(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(j.Z.Item,{label:"Key Name",name:"key_alias",children:(0,r.jsx)(y.Z,{})}),(0,r.jsx)(j.Z.Item,{label:"Team ID (Contact Group)",name:"team_id",children:(0,r.jsx)(y.Z,{placeholder:"ai_team"})}),(0,r.jsx)(j.Z.Item,{label:"Description",name:"description",children:(0,r.jsx)(y.Z.TextArea,{placeholder:"Enter description",rows:4})})]}),(0,r.jsx)("div",{style:{textAlign:"right",marginTop:"10px"},children:(0,r.jsx)(g.ZP,{htmlType:"submit",children:"Create Key"})})]})}),k&&(0,r.jsx)(p.Z,{title:"Save your key",visible:f,onOk:N,onCancel:S,footer:null,children:(0,r.jsxs)(h.Z,{numItems:1,className:"gap-2 w-full",children:[(0,r.jsx)(d.Z,{numColSpan:1,children:(0,r.jsxs)("p",{children:["Please save this secret key somewhere safe and accessible. For security reasons, ",(0,r.jsx)("b",{children:"you will not be able to view it again"})," ","through your LiteLLM account. If you lose this secret key, you will need to generate a new one."]})}),(0,r.jsx)(d.Z,{numColSpan:1,children:null!=k?(0,r.jsxs)(m.Z,{children:["API Key: ",k]}):(0,r.jsx)(m.Z,{children:"Key being created, this might take 30s"})})]})})]})},k=l(33393),b=l(13810),N=l(61244),S=l(10827),_=l(3851),D=l(2044),C=l(64167),v=l(74480),I=l(7178),E=l(42440),T=l(9853),A=l(67989),O=l(56863),F=e=>{let{token:t,accessToken:l,keySpend:n,keyBudget:a,keyName:o}=e,[i,d]=(0,s.useState)(!1),[h,m]=(0,s.useState)(null),[x,j]=(0,s.useState)(null),y=async()=>{try{if(null==l||null==t)return;let e=await c(l,t);console.log("Response:",e);let r=Object.values(e).reduce((e,t)=>{let l=new Date(t.startTime),r=new Intl.DateTimeFormat("en-US",{day:"2-digit",month:"short"}).format(l);return e[r]=(e[r]||0)+t.spend,e},{}),s=Object.entries(r);s.sort((e,t)=>{let[l]=e,[r]=t,s=new Date(l),n=new Date(r);return s.getTime()-n.getTime()});let n=Object.fromEntries(s);console.log(n);let a=Object.values(e).reduce((e,t)=>{let l=t.user;return e[l]=(e[l]||0)+t.spend,e},{});console.log(r),console.log(a);let o=[];for(let[e,t]of Object.entries(n))o.push({day:e,spend:t});let i=Object.entries(a).sort((e,t)=>t[1]-e[1]).slice(0,5).map(e=>{let[t,l]=e;return{name:t,value:l}});m(o),j(i),console.log("arrayBarChart:",o)}catch(e){console.error("There was an error fetching the data",e)}};return((0,s.useEffect)(()=>{y()},[t]),t)?(0,r.jsxs)("div",{children:[(0,r.jsx)(u.Z,{className:"mx-auto",onClick:()=>{d(!0)},children:"View Spend Report"}),(0,r.jsxs)(p.Z,{visible:i,width:1e3,onOk:()=>{d(!1)},onCancel:()=>{d(!1)},footer:null,children:[(0,r.jsxs)(E.Z,{style:{textAlign:"left"},children:["Key Name: ",o]}),(0,r.jsxs)(O.Z,{children:["Monthly Spend $",n]}),(0,r.jsx)(b.Z,{className:"mt-6 
mb-6",children:h&&(0,r.jsx)(T.Z,{className:"mt-6",data:h,colors:["green"],index:"day",categories:["spend"],yAxisWidth:48})}),(0,r.jsx)(E.Z,{className:"mt-6",children:"Top 5 Users Spend (USD)"}),(0,r.jsx)(b.Z,{className:"mb-6",children:x&&(0,r.jsx)(A.Z,{className:"mt-6",data:x,color:"teal"})})]})]}):null},P=e=>{let{userID:t,accessToken:l,data:s,setData:n}=e,a=async e=>{if(null!=s)try{await o(l,e);let t=s.filter(t=>t.token!==e);n(t)}catch(e){console.error("Error deleting the key:",e)}};if(null!=s)return console.log("RERENDER TRIGGERED"),(0,r.jsxs)(b.Z,{className:"w-full mx-auto flex-auto overflow-y-auto max-h-[50vh] mb-4",children:[(0,r.jsx)(E.Z,{children:"API Keys"}),(0,r.jsxs)(S.Z,{className:"mt-5",children:[(0,r.jsx)(C.Z,{children:(0,r.jsxs)(I.Z,{children:[(0,r.jsx)(v.Z,{children:"Key Alias"}),(0,r.jsx)(v.Z,{children:"Secret Key"}),(0,r.jsx)(v.Z,{children:"Spend (USD)"}),(0,r.jsx)(v.Z,{children:"Key Budget (USD)"}),(0,r.jsx)(v.Z,{children:"Team ID"}),(0,r.jsx)(v.Z,{children:"Metadata"}),(0,r.jsx)(v.Z,{children:"Expires"})]})}),(0,r.jsx)(_.Z,{children:s.map(e=>(console.log(e),"litellm-dashboard"===e.team_id)?null:(0,r.jsxs)(I.Z,{children:[(0,r.jsx)(D.Z,{children:null!=e.key_alias?(0,r.jsx)(m.Z,{children:e.key_alias}):(0,r.jsx)(m.Z,{children:"Not Set"})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(m.Z,{children:e.key_name})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(m.Z,{children:e.spend})}),(0,r.jsx)(D.Z,{children:null!=e.max_budget?(0,r.jsx)(m.Z,{children:e.max_budget}):(0,r.jsx)(m.Z,{children:"Unlimited Budget"})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(m.Z,{children:e.team_id})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(m.Z,{children:JSON.stringify(e.metadata)})}),(0,r.jsx)(D.Z,{children:null!=e.expires?(0,r.jsx)(m.Z,{children:e.expires}):(0,r.jsx)(m.Z,{children:"Never expires"})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(N.Z,{onClick:()=>a(e.token),icon:k.Z,size:"sm"})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(F,{token:e.token,accessToken:l,keySpend:e.spend,keyBudget:e.max_budget,keyName:e.key_name})})]},e.token))})]})]})},R=e=>{let{userID:t,userSpendData:l}=e;console.log("User SpendData:",l);let s=null==l?void 0:l.spend,n=(null==l?void 0:l.max_budget)||null,a=null!==n?"$".concat(n," limit"):"No limit";return"$".concat(s," / ").concat(a),(0,r.jsx)(r.Fragment,{children:(0,r.jsxs)(b.Z,{className:"mx-auto mb-4",children:[(0,r.jsxs)(O.Z,{children:["$",s]}),(0,r.jsxs)(E.Z,{children:["/ ",a]})]})})},K=l(8792),U=e=>{let{userID:t,userRole:l}=e;return console.log("User ID:",t),(0,r.jsxs)("nav",{className:"left-0 right-0 top-0 flex justify-between items-center h-12 mb-4",children:[(0,r.jsx)("div",{className:"text-left mx-4 my-2 absolute top-0 left-0",children:(0,r.jsx)("div",{className:"flex flex-col items-center",children:(0,r.jsx)(K.default,{href:"/",children:(0,r.jsx)("button",{className:"text-gray-800 text-2xl px-4 py-1 rounded text-center",children:"\uD83D\uDE85 LiteLLM"})})})}),(0,r.jsx)("div",{className:"text-right mx-4 my-2 absolute top-0 right-0",children:(0,r.jsxs)(u.Z,{variant:"secondary",children:[t,(0,r.jsxs)("p",{children:["Role: ",l]})]})})]})},B=l(47907),L=l(37963),M=()=>{let[e,t]=(0,s.useState)(null),[l,n]=(0,s.useState)(null),a=(0,B.useSearchParams)(),o=a.get("userID");a.get("viewSpend");let c=a.get("token"),[u,m]=(0,s.useState)(null),[x,j]=(0,s.useState)(null);if((0,s.useEffect)(()=>{if(c){let e=(0,L.o)(c);if(e){if(console.log("Decoded token:",e),console.log("Decoded key:",e.key),m(e.key),e.user_role){let t=function(e){if(!e)return"Undefined Role";switch(e.toLowerCase()){case"app_owner":return"App 
Owner";case"demo_app_owner":return"AppOwner";case"admin":return"Admin";case"app_user":return"App User";default:return"Unknown Role"}}(e.user_role);console.log("Decoded user_role:",t),j(t)}else console.log("User role not defined")}}o&&u&&!e&&(async()=>{try{let e=await i(u,o);console.log("Response:",e),n(e.user_info),t(e.keys)}catch(e){console.error("There was an error fetching the data",e)}})()},[o,c,u,e]),null==o||null==c){let e="/sso/key/generate";return console.log("Full URL:",e),window.location.href=e,null}return null==u?null:(null==x&&j("App Owner"),(0,r.jsxs)("div",{children:[(0,r.jsx)(U,{userID:o,userRole:x}),(0,r.jsx)(h.Z,{numItems:1,className:"gap-0 p-10 h-[75vh] w-full",children:(0,r.jsxs)(d.Z,{numColSpan:1,children:[(0,r.jsx)(R,{userID:o,userSpendData:l}),(0,r.jsx)(P,{userID:o,accessToken:u,data:e,setData:t}),(0,r.jsx)(w,{userID:o,userRole:x,accessToken:u,data:e,setData:t})]})})]}))}}},function(e){e.O(0,[787,971,69,744],function(){return e(e.s=88102)}),_N_E=e.O()}]); \ No newline at end of file diff --git a/ui/litellm-dashboard/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js b/litellm/proxy/_experimental/out/_next/static/chunks/main-app-096338c8e1915716.js similarity index 54% rename from ui/litellm-dashboard/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js rename to litellm/proxy/_experimental/out/_next/static/chunks/main-app-096338c8e1915716.js index 440df3cb37..421ae3e2c5 100644 --- a/ui/litellm-dashboard/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js +++ b/litellm/proxy/_experimental/out/_next/static/chunks/main-app-096338c8e1915716.js @@ -1 +1 @@ -(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]); \ No newline at end of file +(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{70377:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(70377)}),_N_E=e.O()}]); \ No newline at end of file diff --git a/litellm/proxy/_experimental/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js b/litellm/proxy/_experimental/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js deleted file mode 100644 index 440df3cb37..0000000000 --- a/litellm/proxy/_experimental/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js +++ /dev/null @@ -1 +0,0 @@ -(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]); \ No newline at end of file diff --git a/litellm/proxy/_experimental/out/_next/static/lGjwnJSGwBqa476jHHI8W/_buildManifest.js 
b/litellm/proxy/_experimental/out/_next/static/p5gDwQBbgW8D3Uz3lgoZg/_buildManifest.js similarity index 100% rename from litellm/proxy/_experimental/out/_next/static/lGjwnJSGwBqa476jHHI8W/_buildManifest.js rename to litellm/proxy/_experimental/out/_next/static/p5gDwQBbgW8D3Uz3lgoZg/_buildManifest.js diff --git a/litellm/proxy/_experimental/out/_next/static/lGjwnJSGwBqa476jHHI8W/_ssgManifest.js b/litellm/proxy/_experimental/out/_next/static/p5gDwQBbgW8D3Uz3lgoZg/_ssgManifest.js similarity index 100% rename from litellm/proxy/_experimental/out/_next/static/lGjwnJSGwBqa476jHHI8W/_ssgManifest.js rename to litellm/proxy/_experimental/out/_next/static/p5gDwQBbgW8D3Uz3lgoZg/_ssgManifest.js diff --git a/litellm/proxy/_experimental/out/index.html b/litellm/proxy/_experimental/out/index.html index f0fb6f14c3..9537fb7232 100644 --- a/litellm/proxy/_experimental/out/index.html +++ b/litellm/proxy/_experimental/out/index.html @@ -1 +1 @@ -Create Next App
Loading...
\ No newline at end of file +🚅 LiteLLM
Loading...
\ No newline at end of file diff --git a/litellm/proxy/_experimental/out/index.txt b/litellm/proxy/_experimental/out/index.txt index db2a9c631b..f48954f2c4 100644 --- a/litellm/proxy/_experimental/out/index.txt +++ b/litellm/proxy/_experimental/out/index.txt @@ -1,7 +1,7 @@ 2:"$Sreact.suspense" -3:I[27680,["787","static/chunks/787-5bb33960644f5c7c.js","931","static/chunks/app/page-992f4cdd1053ee86.js"],""] +3:I[27680,["787","static/chunks/787-5bb33960644f5c7c.js","931","static/chunks/app/page-7f03ccc8529ada97.js"],""] 4:I[5613,[],""] 5:I[31778,[],""] -0:["lGjwnJSGwBqa476jHHI8W",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$2",null,{"fallback":["$","div",null,{"children":"Loading..."}],"children":["$","div",null,{"className":"flex min-h-screen flex-col ","children":["$","$L3",null,{}]}]}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/a6a9860a7fe022a9.css","precedence":"next","crossOrigin":""}]],"$L6"]]]] -6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"Create Next App"}],["$","meta","3",{"name":"description","content":"Generated by create next app"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]] +0:["p5gDwQBbgW8D3Uz3lgoZg",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$2",null,{"fallback":["$","div",null,{"children":"Loading..."}],"children":["$","div",null,{"className":"flex min-h-screen flex-col 
","children":["$","$L3",null,{}]}]}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/a6a9860a7fe022a9.css","precedence":"next","crossOrigin":""}]],"$L6"]]]] +6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"🚅 LiteLLM"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]] 1:null diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 3b8b5a3b32..95c2f2ccb9 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -1889,6 +1889,7 @@ async def startup_event(): user_id = "default_user_id" if os.getenv("PROXY_ADMIN_ID", None) is not None: user_id = os.getenv("PROXY_ADMIN_ID") + asyncio.create_task( generate_key_helper_fn( duration=None, @@ -1899,6 +1900,10 @@ async def startup_event(): token=master_key, user_id=user_id, user_role="proxy_admin", + query_type="update_data", + update_key_values={ + "user_role": "proxy_admin", + }, ) ) @@ -3461,7 +3466,6 @@ async def auth_callback(request: Request): response = await generate_key_helper_fn( **{"duration": "1hr", "key_max_budget": 0, "models": [], "aliases": {}, "config": {}, "spend": 0, "user_id": user_id, "team_id": "litellm-dashboard", "user_email": user_email} # type: ignore ) - key = response["token"] # type: ignore user_id = response["user_id"] # type: ignore diff --git a/ui/litellm-dashboard/out/404.html b/ui/litellm-dashboard/out/404.html index 1ec6cd9a4f..c57eb5193a 100644 --- a/ui/litellm-dashboard/out/404.html +++ b/ui/litellm-dashboard/out/404.html @@ -1 +1 @@ -404: This page could not be found.Create Next App

404 This page could not be found.
\ No newline at end of file +404: This page could not be found.🚅 LiteLLM 404 This page could not be found.
\ No newline at end of file diff --git a/ui/litellm-dashboard/out/_next/static/chunks/app/layout-4d667c133e03c98b.js b/ui/litellm-dashboard/out/_next/static/chunks/app/layout-4d667c133e03c98b.js deleted file mode 100644 index e261adc052..0000000000 --- a/ui/litellm-dashboard/out/_next/static/chunks/app/layout-4d667c133e03c98b.js +++ /dev/null @@ -1 +0,0 @@ -(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]); \ No newline at end of file diff --git a/ui/litellm-dashboard/out/_next/static/chunks/app/page-992f4cdd1053ee86.js b/ui/litellm-dashboard/out/_next/static/chunks/app/page-992f4cdd1053ee86.js deleted file mode 100644 index cd4ccb43cb..0000000000 --- a/ui/litellm-dashboard/out/_next/static/chunks/app/page-992f4cdd1053ee86.js +++ /dev/null @@ -1 +0,0 @@ -(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[931],{88102:function(e,t,l){Promise.resolve().then(l.bind(l,27680))},27680:function(e,t,l){"use strict";l.r(t),l.d(t,{default:function(){return M}});var r=l(3827),s=l(64090),n=l(80588);let a=async(e,t,l)=>{try{if(console.log("Form Values in keyCreateCall:",l),l.description&&(l.metadata||(l.metadata={}),l.metadata.description=l.description,delete l.description,l.metadata=JSON.stringify(l.metadata)),l.metadata){console.log("formValues.metadata:",l.metadata);try{l.metadata=JSON.parse(l.metadata)}catch(e){throw n.ZP.error("Failed to parse metadata: "+e),Error("Failed to parse metadata: "+e)}}console.log("Form Values after check:",l);let r=await fetch("/key/generate",{method:"POST",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"},body:JSON.stringify({user_id:t,...l})});if(!r.ok){let e=await r.text();throw n.ZP.error("Failed to create key: "+e),console.error("Error response from the server:",e),Error("Network response was not ok")}let s=await r.json();return console.log("API Response:",s),s}catch(e){throw console.error("Failed to create key:",e),e}},o=async(e,t)=>{try{console.log("in keyDeleteCall:",t);let l=await fetch("/key/delete",{method:"POST",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"},body:JSON.stringify({keys:[t]})});if(!l.ok){let e=await l.text();throw n.ZP.error("Failed to delete key: "+e),Error("Network response was not ok")}let r=await l.json();return console.log(r),n.ZP.success("API Key Deleted"),r}catch(e){throw console.error("Failed to create key:",e),e}},i=async(e,t)=>{try{let l="/user/info";console.log("in userInfoCall:",l);let r=await fetch("".concat(l,"/?user_id=").concat(t),{method:"GET",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"}});if(!r.ok){let e=await r.text();throw n.ZP.error(e),Error("Network response was not ok")}let s=await r.json();return console.log(s),s}catch(e){throw console.error("Failed to create key:",e),e}},c=async(e,t)=>{try{let l="/spend/logs";console.log("in keySpendLogsCall:",l);let r=await fetch("".concat(l,"/?api_key=").concat(t),{method:"GET",headers:{Authorization:"Bearer ".concat(e),"Content-Type":"application/json"}});if(!r.ok){let e=await r.text();throw n.ZP.error(e),Error("Network response was not ok")}let s=await r.json();return console.log(s),s}catch(e){throw console.error("Failed to 
create key:",e),e}};var d=l(10384),h=l(46453),u=l(2179),m=l(71801),x=l(96776),j=l(2902),p=l(77171),y=l(29714),Z=l(88707),g=l(1861);let{Option:f}=x.default;var w=e=>{let{userID:t,userRole:l,accessToken:o,data:i,setData:c}=e,[x]=j.Z.useForm(),[f,w]=(0,s.useState)(!1),[k,b]=(0,s.useState)(null),N=()=>{w(!1),x.resetFields()},S=()=>{w(!1),b(null),x.resetFields()},_=async e=>{try{n.ZP.info("Making API Call"),e.models&&""!==e.models.trim()?e.models=e.models.split(",").map(e=>e.trim()):e.models=[],w(!0);let l=await a(o,t,e);c(e=>e?[...e,l]:[l]),b(l.key),n.ZP.success("API Key Created"),x.resetFields()}catch(e){console.error("Error creating the key:",e)}};return(0,r.jsxs)("div",{children:[(0,r.jsx)(u.Z,{className:"mx-auto",onClick:()=>w(!0),children:"+ Create New Key"}),(0,r.jsx)(p.Z,{title:"Create Key",visible:f,width:800,footer:null,onOk:N,onCancel:S,children:(0,r.jsxs)(j.Z,{form:x,onFinish:_,labelCol:{span:6},wrapperCol:{span:16},labelAlign:"left",children:["App Owner"===l||"Admin"===l?(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(j.Z.Item,{label:"Key Name",name:"key_alias",children:(0,r.jsx)(y.Z,{})}),(0,r.jsx)(j.Z.Item,{label:"Team ID",name:"team_id",children:(0,r.jsx)(y.Z,{placeholder:"ai_team"})}),(0,r.jsx)(j.Z.Item,{label:"Models (Comma Separated). Eg: gpt-3.5-turbo,gpt-4",name:"models",children:(0,r.jsx)(y.Z,{placeholder:"gpt-4,gpt-3.5-turbo"})}),(0,r.jsx)(j.Z.Item,{label:"Max Budget (USD)",name:"max_budget",children:(0,r.jsx)(Z.Z,{step:.01,precision:2,width:200})}),(0,r.jsx)(j.Z.Item,{label:"Duration (eg: 30s, 30h, 30d)",name:"duration",children:(0,r.jsx)(y.Z,{})}),(0,r.jsx)(j.Z.Item,{label:"Metadata",name:"metadata",children:(0,r.jsx)(y.Z.TextArea,{rows:4,placeholder:"Enter metadata as JSON"})})]}):(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(j.Z.Item,{label:"Key Name",name:"key_alias",children:(0,r.jsx)(y.Z,{})}),(0,r.jsx)(j.Z.Item,{label:"Team ID (Contact Group)",name:"team_id",children:(0,r.jsx)(y.Z,{placeholder:"ai_team"})}),(0,r.jsx)(j.Z.Item,{label:"Description",name:"description",children:(0,r.jsx)(y.Z.TextArea,{placeholder:"Enter description",rows:4})})]}),(0,r.jsx)("div",{style:{textAlign:"right",marginTop:"10px"},children:(0,r.jsx)(g.ZP,{htmlType:"submit",children:"Create Key"})})]})}),k&&(0,r.jsx)(p.Z,{title:"Save your key",visible:f,onOk:N,onCancel:S,footer:null,children:(0,r.jsxs)(h.Z,{numItems:1,className:"gap-2 w-full",children:[(0,r.jsx)(d.Z,{numColSpan:1,children:(0,r.jsxs)("p",{children:["Please save this secret key somewhere safe and accessible. For security reasons, ",(0,r.jsx)("b",{children:"you will not be able to view it again"})," ","through your LiteLLM account. 
If you lose this secret key, you will need to generate a new one."]})}),(0,r.jsx)(d.Z,{numColSpan:1,children:null!=k?(0,r.jsxs)(m.Z,{children:["API Key: ",k]}):(0,r.jsx)(m.Z,{children:"Key being created, this might take 30s"})})]})})]})},k=l(33393),b=l(13810),N=l(61244),S=l(10827),_=l(3851),D=l(2044),C=l(64167),v=l(74480),I=l(7178),E=l(42440),T=l(9853),A=l(67989),O=l(56863),F=e=>{let{token:t,accessToken:l,keySpend:n,keyBudget:a,keyName:o}=e,[i,d]=(0,s.useState)(!1),[h,m]=(0,s.useState)(null),[x,j]=(0,s.useState)(null),y=async()=>{try{if(null==l||null==t)return;let e=await c(l,t);console.log("Response:",e);let r=Object.values(e).reduce((e,t)=>{let l=new Date(t.startTime),r=new Intl.DateTimeFormat("en-US",{day:"2-digit",month:"short"}).format(l);return e[r]=(e[r]||0)+t.spend,e},{}),s=Object.entries(r);s.sort((e,t)=>{let[l]=e,[r]=t,s=new Date(l),n=new Date(r);return s.getTime()-n.getTime()});let n=Object.fromEntries(s);console.log(n);let a=Object.values(e).reduce((e,t)=>{let l=t.user;return e[l]=(e[l]||0)+t.spend,e},{});console.log(r),console.log(a);let o=[];for(let[e,t]of Object.entries(n))o.push({day:e,spend:t});let i=Object.entries(a).sort((e,t)=>t[1]-e[1]).slice(0,5).map(e=>{let[t,l]=e;return{name:t,value:l}});m(o),j(i),console.log("arrayBarChart:",o)}catch(e){console.error("There was an error fetching the data",e)}};return((0,s.useEffect)(()=>{y()},[t]),t)?(0,r.jsxs)("div",{children:[(0,r.jsx)(u.Z,{className:"mx-auto",onClick:()=>{d(!0)},children:"View Spend Report"}),(0,r.jsxs)(p.Z,{visible:i,width:1e3,onOk:()=>{d(!1)},onCancel:()=>{d(!1)},footer:null,children:[(0,r.jsxs)(E.Z,{style:{textAlign:"left"},children:["Key Name: ",o]}),(0,r.jsxs)(O.Z,{children:["Monthly Spend $",n]}),(0,r.jsx)(b.Z,{className:"mt-6 mb-6",children:h&&(0,r.jsx)(T.Z,{className:"mt-6",data:h,colors:["green"],index:"day",categories:["spend"],yAxisWidth:48})}),(0,r.jsx)(E.Z,{className:"mt-6",children:"Top 5 Users Spend (USD)"}),(0,r.jsx)(b.Z,{className:"mb-6",children:x&&(0,r.jsx)(A.Z,{className:"mt-6",data:x,color:"teal"})})]})]}):null},P=e=>{let{userID:t,accessToken:l,data:s,setData:n}=e,a=async e=>{if(null!=s)try{await o(l,e);let t=s.filter(t=>t.token!==e);n(t)}catch(e){console.error("Error deleting the key:",e)}};if(null!=s)return console.log("RERENDER TRIGGERED"),(0,r.jsxs)(b.Z,{className:"w-full mx-auto flex-auto overflow-y-auto max-h-[50vh] mb-4",children:[(0,r.jsx)(E.Z,{children:"API Keys"}),(0,r.jsxs)(S.Z,{className:"mt-5",children:[(0,r.jsx)(C.Z,{children:(0,r.jsxs)(I.Z,{children:[(0,r.jsx)(v.Z,{children:"Key Alias"}),(0,r.jsx)(v.Z,{children:"Secret Key"}),(0,r.jsx)(v.Z,{children:"Spend (USD)"}),(0,r.jsx)(v.Z,{children:"Key Budget (USD)"}),(0,r.jsx)(v.Z,{children:"Team ID"}),(0,r.jsx)(v.Z,{children:"Metadata"}),(0,r.jsx)(v.Z,{children:"Expires"})]})}),(0,r.jsx)(_.Z,{children:s.map(e=>(console.log(e),"litellm-dashboard"===e.team_id)?null:(0,r.jsxs)(I.Z,{children:[(0,r.jsx)(D.Z,{children:null!=e.key_alias?(0,r.jsx)(m.Z,{children:e.key_alias}):(0,r.jsx)(m.Z,{children:"Not Set"})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(m.Z,{children:e.key_name})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(m.Z,{children:e.spend})}),(0,r.jsx)(D.Z,{children:null!=e.max_budget?(0,r.jsx)(m.Z,{children:e.max_budget}):(0,r.jsx)(m.Z,{children:"Unlimited Budget"})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(m.Z,{children:e.team_id})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(m.Z,{children:JSON.stringify(e.metadata)})}),(0,r.jsx)(D.Z,{children:null!=e.expires?(0,r.jsx)(m.Z,{children:e.expires}):(0,r.jsx)(m.Z,{children:"Never 
expires"})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(N.Z,{onClick:()=>a(e.token),icon:k.Z,size:"sm"})}),(0,r.jsx)(D.Z,{children:(0,r.jsx)(F,{token:e.token,accessToken:l,keySpend:e.spend,keyBudget:e.max_budget,keyName:e.key_name})})]},e.token))})]})]})},R=e=>{let{userID:t,userSpendData:l}=e;console.log("User SpendData:",l);let s=null==l?void 0:l.spend,n=(null==l?void 0:l.max_budget)||null,a=null!==n?"$".concat(n," limit"):"No limit";return"$".concat(s," / ").concat(a),(0,r.jsx)(r.Fragment,{children:(0,r.jsxs)(b.Z,{className:"mx-auto mb-4",children:[(0,r.jsxs)(O.Z,{children:["$",s]}),(0,r.jsxs)(E.Z,{children:["/ ",a]})]})})},K=l(8792),U=e=>{let{userID:t,userRole:l}=e;return console.log("User ID:",t),(0,r.jsxs)("nav",{className:"left-0 right-0 top-0 flex justify-between items-center h-12 mb-4",children:[(0,r.jsx)("div",{className:"text-left mx-4 my-2 absolute top-0 left-0",children:(0,r.jsx)("div",{className:"flex flex-col items-center",children:(0,r.jsx)(K.default,{href:"/",children:(0,r.jsx)("button",{className:"text-gray-800 text-2xl px-4 py-1 rounded text-center",children:"\uD83D\uDE85 LiteLLM"})})})}),(0,r.jsx)("div",{className:"text-right mx-4 my-2 absolute top-0 right-0",children:(0,r.jsxs)(u.Z,{variant:"secondary",children:[t,(0,r.jsxs)("p",{children:["Role: ",l]})]})})]})},B=l(47907),L=l(37963),M=()=>{let[e,t]=(0,s.useState)(null),[l,n]=(0,s.useState)(null),a=(0,B.useSearchParams)(),o=a.get("userID");a.get("viewSpend");let c=a.get("token"),[u,m]=(0,s.useState)(null),[x,j]=(0,s.useState)(null);if((0,s.useEffect)(()=>{if(c){let e=(0,L.o)(c);if(e){if(console.log("Decoded token:",e),console.log("Decoded key:",e.key),m(e.key),e.user_role){let t=function(e){if(!e)return"Undefined Role";switch(e.toLowerCase()){case"app_owner":return"App Owner";case"demo_app_owner":return"AppOwner";case"admin":return"Admin";case"app_user":return"App User";default:return"Unknown Role"}}(e.user_role);console.log("Decoded user_role:",t),j(t)}else console.log("User role not defined")}}o&&u&&!e&&(async()=>{try{let e=await i(u,o);console.log("Response:",e),n(e.user_info),t(e.keys)}catch(e){console.error("There was an error fetching the data",e)}})()},[o,c,u,e]),null==o||null==c){let e="/sso/key/generate";return console.log("Full URL:",e),window.location.href=e,null}return null==u?null:(null==x&&j("App Owner"),(0,r.jsxs)("div",{children:[(0,r.jsx)(U,{userID:o,userRole:x}),(0,r.jsx)(h.Z,{numItems:1,className:"gap-0 p-10 h-[75vh] w-full",children:(0,r.jsxs)(d.Z,{numColSpan:1,children:[(0,r.jsx)(R,{userID:o,userSpendData:l}),(0,r.jsx)(P,{userID:o,accessToken:u,data:e,setData:t}),(0,r.jsx)(w,{userID:o,userRole:x,accessToken:u,data:e,setData:t})]})})]}))}}},function(e){e.O(0,[787,971,69,744],function(){return e(e.s=88102)}),_N_E=e.O()}]); \ No newline at end of file diff --git a/ui/litellm-dashboard/out/_next/static/lGjwnJSGwBqa476jHHI8W/_buildManifest.js b/ui/litellm-dashboard/out/_next/static/lGjwnJSGwBqa476jHHI8W/_buildManifest.js deleted file mode 100644 index f779caa02f..0000000000 --- a/ui/litellm-dashboard/out/_next/static/lGjwnJSGwBqa476jHHI8W/_buildManifest.js +++ /dev/null @@ -1 +0,0 @@ -self.__BUILD_MANIFEST={__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/_error":["static/chunks/pages/_error-d6107f1aac0c574c.js"],sortedPages:["/_app","/_error"]},self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB(); \ No newline at end of file diff --git a/ui/litellm-dashboard/out/_next/static/lGjwnJSGwBqa476jHHI8W/_ssgManifest.js b/ui/litellm-dashboard/out/_next/static/lGjwnJSGwBqa476jHHI8W/_ssgManifest.js deleted file 
mode 100644 index 5b3ff592fd..0000000000 --- a/ui/litellm-dashboard/out/_next/static/lGjwnJSGwBqa476jHHI8W/_ssgManifest.js +++ /dev/null @@ -1 +0,0 @@ -self.__SSG_MANIFEST=new Set([]);self.__SSG_MANIFEST_CB&&self.__SSG_MANIFEST_CB() \ No newline at end of file diff --git a/ui/litellm-dashboard/out/index.html b/ui/litellm-dashboard/out/index.html index f0fb6f14c3..9537fb7232 100644 --- a/ui/litellm-dashboard/out/index.html +++ b/ui/litellm-dashboard/out/index.html @@ -1 +1 @@ -Create Next App
Loading...
\ No newline at end of file +🚅 LiteLLM
Loading...
\ No newline at end of file diff --git a/ui/litellm-dashboard/out/index.txt b/ui/litellm-dashboard/out/index.txt index db2a9c631b..f48954f2c4 100644 --- a/ui/litellm-dashboard/out/index.txt +++ b/ui/litellm-dashboard/out/index.txt @@ -1,7 +1,7 @@ 2:"$Sreact.suspense" -3:I[27680,["787","static/chunks/787-5bb33960644f5c7c.js","931","static/chunks/app/page-992f4cdd1053ee86.js"],""] +3:I[27680,["787","static/chunks/787-5bb33960644f5c7c.js","931","static/chunks/app/page-7f03ccc8529ada97.js"],""] 4:I[5613,[],""] 5:I[31778,[],""] -0:["lGjwnJSGwBqa476jHHI8W",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$2",null,{"fallback":["$","div",null,{"children":"Loading..."}],"children":["$","div",null,{"className":"flex min-h-screen flex-col ","children":["$","$L3",null,{}]}]}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/a6a9860a7fe022a9.css","precedence":"next","crossOrigin":""}]],"$L6"]]]] -6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"Create Next App"}],["$","meta","3",{"name":"description","content":"Generated by create next app"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]] +0:["p5gDwQBbgW8D3Uz3lgoZg",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$2",null,{"fallback":["$","div",null,{"children":"Loading..."}],"children":["$","div",null,{"className":"flex min-h-screen flex-col 
","children":["$","$L3",null,{}]}]}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/a6a9860a7fe022a9.css","precedence":"next","crossOrigin":""}]],"$L6"]]]] +6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"🚅 LiteLLM"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]] 1:null diff --git a/ui/litellm-dashboard/src/components/networking.tsx b/ui/litellm-dashboard/src/components/networking.tsx index 5b8e422864..12eea2dd8e 100644 --- a/ui/litellm-dashboard/src/components/networking.tsx +++ b/ui/litellm-dashboard/src/components/networking.tsx @@ -73,7 +73,7 @@ export const keyDeleteCall = async (accessToken: String, user_key: String) => { try { const url = proxyBaseUrl ? 
`${proxyBaseUrl}/key/delete` : `/key/delete`; console.log("in keyDeleteCall:", user_key); - + message.info("Making key delete request"); const response = await fetch(url, { method: "POST", headers: { From 1719f2fabc0a72762cae15c0ee8ae43455559187 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Tue, 6 Feb 2024 22:18:46 -0800 Subject: [PATCH 218/218] fix(ollama_chat.py): fix token counting --- litellm/llms/ollama_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 0311931b13..c9d6654c7c 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -230,7 +230,7 @@ def get_ollama_response( model_response["model"] = "ollama/" + model prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore completion_tokens = response_json.get( - "eval_count", litellm.token_counter(text=response_json["message"]) + "eval_count", litellm.token_counter(text=response_json["message"]["content"]) ) model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens,
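To make the intent of that last hunk concrete: Ollama's chat endpoint returns the assistant turn as a dict (`{"role": ..., "content": ...}`), so the old code handed the whole dict to the token counter, which miscounts or errors depending on how the counter handles non-string input; the fix counts only the `content` string and still prefers Ollama's own `eval_count` when the server reports it. Below is a minimal, self-contained sketch of that fallback logic. The helper names, the whitespace-split stand-in for a real tokenizer, and the `total_tokens` field are illustrative assumptions, not LiteLLM's actual implementation (which uses `litellm.token_counter` and `litellm.Usage` as shown in the diff).

```python
# Sketch of the usage-counting logic from the ollama_chat fix, using a stand-in
# tokenizer: prefer Ollama's reported counts, otherwise count the *content string*
# of the assistant message -- never the message dict itself.

def _approx_token_count(text: str) -> int:
    # Stand-in for a real tokenizer; a whitespace split is only illustrative.
    return len(text.split())

def build_usage(response_json: dict, request_messages: list) -> dict:
    prompt_text = " ".join(m.get("content", "") for m in request_messages)
    prompt_tokens = response_json.get(
        "prompt_eval_count", _approx_token_count(prompt_text)
    )
    completion_tokens = response_json.get(
        "eval_count",
        # The fix: count the content string, not the whole message dict.
        _approx_token_count(response_json["message"]["content"]),
    )
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }

if __name__ == "__main__":
    fake_response = {"message": {"role": "assistant", "content": "Hello there, how can I help?"}}
    print(build_usage(fake_response, [{"role": "user", "content": "Hi"}]))
```

Reading `prompt_eval_count`/`eval_count` first keeps the server-reported numbers authoritative when they exist, with local counting used only as a fallback.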