diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md
index e350ce9d5..70fd6e6a8 100644
--- a/docs/my-website/docs/proxy/virtual_keys.md
+++ b/docs/my-website/docs/proxy/virtual_keys.md
@@ -79,6 +79,7 @@ curl 'http://0.0.0.0:8000/key/generate' \
     "metadata": {"user": "ishaan@berri.ai"},
     "team_id": "core-infra",
     "max_budget": 10,
+    "soft_budget": 5,
 }'
 ```
 
@@ -93,6 +94,7 @@ Request Params:
 - `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
 - `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
 - `max_budget`: *Optional[float]* - Specify max budget for a given key.
+- `soft_budget`: *Optional[float]* - Specify a soft budget limit for a given key. Get alerts when the key crosses its soft budget.
 - `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
 - `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
 - `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
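The new `soft_budget` param can be exercised end to end against a running proxy. A minimal sketch, assuming a local proxy on port 8000 and a placeholder master key `sk-1234`, with the `requests` package standing in for curl:

```python
import requests  # assumes the `requests` package is installed

# Hypothetical local proxy + master key; replace with your own values.
PROXY_BASE = "http://0.0.0.0:8000"
MASTER_KEY = "sk-1234"

resp = requests.post(
    f"{PROXY_BASE}/key/generate",
    headers={"Authorization": f"Bearer {MASTER_KEY}"},
    json={
        "team_id": "core-infra",
        "max_budget": 10,  # hard limit - requests are blocked past this
        "soft_budget": 5,  # soft limit - fires an alert, requests keep working
    },
)
resp.raise_for_status()
key_data = resp.json()

# With this change /key/generate echoes the soft budget back in its response.
# When soft_budget is omitted, the stored budget row falls back to
# litellm.default_soft_budget (50.0).
print(key_data["key"], key_data["soft_budget"])
```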
diff --git a/litellm/__init__.py b/litellm/__init__.py
index cd639ddb9..f218fe036 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -79,6 +79,9 @@ max_budget: float = 0.0  # set the max budget across all providers
 budget_duration: Optional[str] = (
     None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 )
+default_soft_budget: float = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 _openai_finish_reasons = ["stop", "length", "function_call", "content_filter", "null"]
 _openai_completion_params = [
     "functions",
diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py
index e981aef6d..ac30977b3 100644
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@@ -151,6 +151,7 @@ class GenerateRequestBase(LiteLLMBase):
     rpm_limit: Optional[int] = None
     budget_duration: Optional[str] = None
     allowed_cache_controls: Optional[list] = []
+    soft_budget: Optional[float] = None
 
 
 class GenerateKeyRequest(GenerateRequestBase):
@@ -327,7 +328,7 @@ class TeamRequest(LiteLLMBase):
 
 class LiteLLM_BudgetTable(LiteLLMBase):
     """Represents user-controllable params for a LiteLLM_BudgetTable record"""
-
+    soft_budget: Optional[float] = None
     max_budget: Optional[float] = None
     max_parallel_requests: Optional[int] = None
     tpm_limit: Optional[int] = None
@@ -366,7 +367,7 @@ class OrganizationRequest(LiteLLMBase):
 
 class BudgetRequest(LiteLLMBase):
     budgets: List[str]
-
+
 
 class KeyManagementSystem(enum.Enum):
     GOOGLE_KMS = "google_kms"
     AZURE_KEY_VAULT = "azure_key_vault"
@@ -585,6 +586,7 @@ class LiteLLM_SpendLogs(LiteLLMBase):
     request_id: str
     api_key: str
     model: Optional[str] = ""
+    api_base: Optional[str] = ""
     call_type: str
     spend: Optional[float] = 0.0
     total_tokens: Optional[int] = 0
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index bd3d111a7..1e3d83142 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -791,6 +791,7 @@ async def user_api_key_auth(
             "/global/spend/keys",
             "/global/spend/models",
             "/global/predict/spend/logs",
+            "/health/services",
         ]  # check if the current route startswith any of the allowed routes
         if (
@@ -1814,6 +1815,9 @@ async def generate_key_helper_fn(
     spend: float,
     key_max_budget: Optional[float] = None,  # key_max_budget is used to Budget Per key
     key_budget_duration: Optional[str] = None,
+    key_soft_budget: Optional[
+        float
+    ] = None,  # key_soft_budget is used to Budget Per key
     max_budget: Optional[float] = None,  # max_budget is used to Budget Per user
     budget_duration: Optional[str] = None,  # max_budget is used to Budget Per user
     token: Optional[str] = None,
@@ -1873,6 +1877,19 @@ async def generate_key_helper_fn(
         rpm_limit = rpm_limit
         allowed_cache_controls = allowed_cache_controls
 
+    # TODO: @ishaan-jaff: Migrate all budget tracking to use LiteLLM_BudgetTable
+    if prisma_client is not None:
+        # create the Budget Row for the LiteLLM Verification Token
+        budget_row = LiteLLM_BudgetTable(
+            soft_budget=key_soft_budget or litellm.default_soft_budget,
+            model_max_budget=model_max_budget or {},
+            created_by=user_id,
+            updated_by=user_id,
+        )
+        new_budget = prisma_client.jsonify_object(budget_row.json(exclude_none=True))
+        _budget = await prisma_client.db.litellm_budgettable.create(data={**new_budget})  # type: ignore
+        _budget_id = getattr(_budget, "id", None)
+
     try:
         # Create a new verification token (you may want to enhance this logic based on your needs)
         user_data = {
@@ -1910,6 +1927,7 @@ async def generate_key_helper_fn(
             "allowed_cache_controls": allowed_cache_controls,
             "permissions": permissions_json,
             "model_max_budget": model_max_budget_json,
+            "budget_id": _budget_id,
         }
         if (
             general_settings.get("allow_user_auth", False) == True
@@ -1982,6 +2000,9 @@ async def generate_key_helper_fn(
     except Exception as e:
         traceback.print_exc()
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
+
+    # Add budget related info in key_data - this ensures it's returned
+    key_data["soft_budget"] = key_soft_budget
     return key_data
@@ -2142,14 +2163,6 @@ async def async_data_generator(response, user_api_key_dict):
     except Exception as e:
         yield f"data: {str(e)}\n\n"
 
-    ### ALERTING ###
-    end_time = time.time()
-    asyncio.create_task(
-        proxy_logging_obj.response_taking_too_long(
-            start_time=start_time, end_time=end_time, type="slow_response"
-        )
-    )
-
     # Streaming is done, yield the [DONE] chunk
     done_message = "[DONE]"
     yield f"data: {done_message}\n\n"
@@ -2497,14 +2510,6 @@ async def completion(
                 headers=custom_headers,
             )
 
-        ### ALERTING ###
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
-
         fastapi_response.headers["x-litellm-model-id"] = model_id
         return response
     except Exception as e:
@@ -2703,14 +2708,6 @@ async def chat_completion(
                 headers=custom_headers,
             )
 
-        ### ALERTING ###
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
-
         fastapi_response.headers["x-litellm-model-id"] = model_id
 
         ### CALL HOOKS ### - modify outgoing data
@@ -2918,12 +2915,6 @@ async def embeddings(
 
         ### ALERTING ###
         data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
 
         return response
     except Exception as e:
@@ -3069,12 +3060,6 @@ async def image_generation(
 
         ### ALERTING ###
         data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
 
         return response
     except Exception as e:
@@ -3228,12 +3213,6 @@ async def moderations(
 
         ### ALERTING ###
         data["litellm_status"] = "success"  # used for alerting
-        end_time = time.time()
-        asyncio.create_task(
-            proxy_logging_obj.response_taking_too_long(
-                start_time=start_time, end_time=end_time, type="slow_response"
-            )
-        )
 
         return response
     except Exception as e:
@@ -3378,6 +3357,8 @@ async def generate_key_fn(
     # if we get max_budget passed to /key/generate, then use it as key_max_budget. Since generate_key_helper_fn is used to make new users
     if "max_budget" in data_json:
         data_json["key_max_budget"] = data_json.pop("max_budget", None)
+    if "soft_budget" in data_json:
+        data_json["key_soft_budget"] = data_json.pop("soft_budget", None)
     if "budget_duration" in data_json:
         data_json["key_budget_duration"] = data_json.pop("budget_duration", None)
@@ -6722,6 +6703,50 @@ async def test_endpoint(request: Request):
     return {"route": request.url.path}
 
 
+@router.get(
+    "/health/services",
+    tags=["health"],
+    dependencies=[Depends(user_api_key_auth)],
+    include_in_schema=False,
+)
+async def health_services_endpoint(
+    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
+    service: Literal["slack_budget_alerts"] = fastapi.Query(
+        description="Specify the service being hit."
+    ),
+):
+    """
+    Hidden endpoint.
+
+    Used by the UI to let user check if slack alerting is working as expected.
+    """
+    global general_settings, proxy_logging_obj
+
+    if service is None:
+        raise HTTPException(
+            status_code=400, detail={"error": "Service must be specified."}
+        )
+
+    if service not in ["slack_budget_alerts"]:
+        raise HTTPException(
+            status_code=400,
+            detail={
+                "error": f"Service must be in list. Service={service}. List={['slack_budget_alerts']}"
+            },
+        )
+
+    test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` my-secret-project \n`Expected Day of Error`: 28th March \n`Current Spend`: 100 \n`Projected Spend at end of month`: 1000 \n
+    """
+
+    if "slack" in general_settings.get("alerting", []):
+        await proxy_logging_obj.alerting_handler(message=test_message, level="Low")
+    else:
+        raise HTTPException(
+            status_code=422,
+            detail={"error": "No slack connection setup. Unable to test this."},
+        )
+
+
 @router.get("/health", tags=["health"], dependencies=[Depends(user_api_key_auth)])
 async def health_endpoint(
     user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
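The new `/health/services` route gives the UI (or any client) a way to fire a test Slack alert. A minimal sketch of calling it, assuming the same local proxy and placeholder key as above and `alerting: ["slack"]` set under the proxy's `general_settings`:

```python
import requests  # assumes the `requests` package is installed

# "slack_budget_alerts" is currently the only accepted service value.
resp = requests.get(
    "http://0.0.0.0:8000/health/services",
    params={"service": "slack_budget_alerts"},
    headers={"Authorization": "Bearer sk-1234"},  # placeholder admin key
)

# 200 -> a test alert was pushed to Slack; 422 -> no slack alerting configured.
print(resp.status_code, resp.text)
```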
diff --git a/litellm/proxy/schema.prisma b/litellm/proxy/schema.prisma
index 2607cf2b0..93a9b3123 100644
--- a/litellm/proxy/schema.prisma
+++ b/litellm/proxy/schema.prisma
@@ -11,10 +11,11 @@ generator client {
 model LiteLLM_BudgetTable {
   budget_id String @id @default(uuid())
   max_budget Float?
+  soft_budget Float?
   max_parallel_requests Int?
   tpm_limit     BigInt?
   rpm_limit     BigInt?
-  model_max_budget Json @default("{}")
+  model_max_budget Json?
   budget_duration String?
   budget_reset_at DateTime?
   created_at DateTime @default(now()) @map("created_at")
@@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
   allowed_cache_controls String[] @default([])
   model_spend Json @default("{}")
   model_max_budget Json @default("{}")
+  budget_id String?
 }
 
 // store proxy config.yaml
@@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
   startTime DateTime // Assuming start_time is a DateTime field
   endTime DateTime // Assuming end_time is a DateTime field
   model String @default("")
+  api_base String @default("")
   user String @default("")
   metadata Json @default("{}")
   cache_hit String @default("")
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 1cc52401a..c67448c86 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -64,6 +64,7 @@ class ProxyLogging:
         litellm.callbacks.append(self.max_parallel_request_limiter)
         litellm.callbacks.append(self.max_budget_limiter)
         litellm.callbacks.append(self.cache_control_check)
+        litellm.callbacks.append(self.response_taking_too_long_callback)
         for callback in litellm.callbacks:
             if callback not in litellm.input_callback:
                 litellm.input_callback.append(callback)
@@ -142,6 +143,30 @@ class ProxyLogging:
                 raise e
         return data
 
+    async def response_taking_too_long_callback(
+        self,
+        kwargs,  # kwargs to completion
+        completion_response,  # response from completion
+        start_time,
+        end_time,  # start/end time
+    ):
+        if self.alerting is None:
+            return
+        time_difference = end_time - start_time
+        # Convert the timedelta to float (in seconds)
+        time_difference_float = time_difference.total_seconds()
+        litellm_params = kwargs.get("litellm_params", {})
+        api_base = litellm_params.get("api_base", "")
+        model = kwargs.get("model", "")
+        messages = kwargs.get("messages", "")
+        request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
+        slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
+        if time_difference_float > self.alerting_threshold:
+            await self.alerting_handler(
+                message=slow_message + request_info,
+                level="Low",
+            )
+
     async def response_taking_too_long(
         self,
         start_time: Optional[float] = None,
@@ -189,16 +214,6 @@ class ProxyLogging:
                     level="Medium",
                 )
 
-        elif (
-            type == "slow_response" and start_time is not None and end_time is not None
-        ):
-            slow_message = f"`Responses are slow - {round(end_time-start_time,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
-            if end_time - start_time > self.alerting_threshold:
-                await self.alerting_handler(
-                    message=slow_message + request_info,
-                    level="Low",
-                )
-
     async def budget_alerts(
         self,
         type: Literal[
@@ -1585,6 +1600,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
         "completion_tokens": usage.get("completion_tokens", 0),
         "request_tags": metadata.get("tags", []),
         "end_user": kwargs.get("user", ""),
+        "api_base": litellm_params.get("api_base", ""),
     }
 
     verbose_proxy_logger.debug(f"SpendTable: created payload - payload: {payload}\n\n")
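With this change the slow-response check runs as a regular logging callback after every completed call, instead of per-endpoint `asyncio.create_task` calls. A standalone sketch of the same threshold logic outside the `ProxyLogging` class; the 300s threshold is illustrative, the proxy reads it from `alerting_threshold`:

```python
from datetime import datetime, timedelta

ALERTING_THRESHOLD_S = 300.0  # illustrative value, not a confirmed default


def should_alert(start_time: datetime, end_time: datetime) -> tuple[bool, str]:
    """Mirror the callback's check: alert when the call took longer than the threshold."""
    elapsed = (end_time - start_time).total_seconds()
    message = (
        f"`Responses are slow - {round(elapsed, 2)}s response time "
        f"> Alerting threshold: {ALERTING_THRESHOLD_S}s`"
    )
    return elapsed > ALERTING_THRESHOLD_S, message


# Example: a call that took ~6 minutes would trigger a "Low" severity alert.
start = datetime(2024, 2, 27, 12, 0, 0)
end = start + timedelta(minutes=6)
print(should_alert(start, end))
```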
diff --git a/pyproject.toml b/pyproject.toml
index 0dbe465c3..65e8645fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.28.8"
+version = "1.28.9"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.commitizen]
-version = "1.28.8"
+version = "1.28.9"
 version_files = [
     "pyproject.toml:^version"
 ]
diff --git a/schema.prisma b/schema.prisma
index e7932d634..93a9b3123 100644
--- a/schema.prisma
+++ b/schema.prisma
@@ -11,6 +11,7 @@ generator client {
 model LiteLLM_BudgetTable {
   budget_id String @id @default(uuid())
   max_budget Float?
+  soft_budget Float?
   max_parallel_requests Int?
   tpm_limit     BigInt?
   rpm_limit     BigInt?
@@ -107,6 +108,7 @@ model LiteLLM_VerificationToken {
   allowed_cache_controls String[] @default([])
   model_spend Json @default("{}")
   model_max_budget Json @default("{}")
+  budget_id String?
 }
 
 // store proxy config.yaml
@@ -127,6 +129,7 @@ model LiteLLM_SpendLogs {
   startTime DateTime // Assuming start_time is a DateTime field
   endTime DateTime // Assuming end_time is a DateTime field
   model String @default("")
+  api_base String @default("")
   user String @default("")
   metadata Json @default("{}")
   cache_hit String @default("")
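The `api_base` column added to `LiteLLM_SpendLogs` (mirrored in both schema files and in `get_logging_payload`) records which upstream endpoint served each request. An illustrative row shape, limited to fields visible in this diff and with made-up values:

```python
# Illustrative only - values are invented, and only fields shown in this diff appear here.
spend_log_row = {
    "request_id": "chatcmpl-123",             # hypothetical request id
    "api_key": "hashed-token",                # hypothetical hashed key
    "call_type": "acompletion",
    "model": "gpt-3.5-turbo",
    "api_base": "https://api.openai.com/v1",  # newly logged upstream endpoint
    "spend": 0.000415,
    "total_tokens": 285,
    "completion_tokens": 140,
    "end_user": "ishaan@berri.ai",
    "request_tags": [],
}
print(spend_log_row["api_base"])
```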
diff --git a/ui/litellm-dashboard/src/components/create_key_button.tsx b/ui/litellm-dashboard/src/components/create_key_button.tsx
index 3dddaf8b7..e76e2d0c2 100644
--- a/ui/litellm-dashboard/src/components/create_key_button.tsx
+++ b/ui/litellm-dashboard/src/components/create_key_button.tsx
@@ -2,7 +2,7 @@
 import React, { useState, useEffect, useRef } from "react";
 import { Button, TextInput, Grid, Col } from "@tremor/react";
-import { Card, Metric, Text } from "@tremor/react";
+import { Card, Metric, Text, Title, Subtitle } from "@tremor/react";
 import {
   Button as Button2,
   Modal,
@@ -38,6 +38,7 @@ const CreateKey: React.FC = ({
   const [form] = Form.useForm();
   const [isModalVisible, setIsModalVisible] = useState(false);
   const [apiKey, setApiKey] = useState(null);
+  const [softBudget, setSoftBudget] = useState(null);
   const handleOk = () => {
     setIsModalVisible(false);
     form.resetFields();
@@ -54,8 +55,11 @@
       message.info("Making API Call");
       setIsModalVisible(true);
       const response = await keyCreateCall(accessToken, userID, formValues);
+
+      console.log("key create Response:", response);
       setData((prevData) => (prevData ? [...prevData, response] : [response])); // Check if prevData is null
       setApiKey(response["key"]);
+      setSoftBudget(response["soft_budget"]);
       message.success("API Key Created");
       form.resetFields();
       localStorage.removeItem("userData" + userID);
@@ -108,6 +112,9 @@
           ))}
+
+
+
@@ -154,28 +161,38 @@
       {apiKey && (
-
-
-            Please save this secret key somewhere safe and accessible. For
-            security reasons, you will not be able to view it again{" "}
-            through your LiteLLM account. If you lose this secret key, you
-            will need to generate a new one.
-
-
-            {apiKey != null ? (
-              API Key: {apiKey}
-            ) : (
-              Key being created, this might take 30s
-            )}
-
+
+          Save your Key
+
+            Please save this secret key somewhere safe and accessible. For
+            security reasons, you will not be able to view it again{" "}
+            through your LiteLLM account. If you lose this secret key, you
+            will need to generate a new one.
+
+
+            {apiKey != null ? (
+
+              API Key: {apiKey}
+              Budgets
+              Soft Limit Budget: ${softBudget}
+
+
+            ) : (
+              Key being created, this might take 30s
+            )}
+
+
       )}
 
 )}
diff --git a/ui/litellm-dashboard/src/components/view_key_spend_report.tsx b/ui/litellm-dashboard/src/components/view_key_spend_report.tsx
index 0788af209..f0916ec01 100644
--- a/ui/litellm-dashboard/src/components/view_key_spend_report.tsx
+++ b/ui/litellm-dashboard/src/components/view_key_spend_report.tsx
@@ -105,7 +105,7 @@ const ViewKeySpendReport: React.FC = ({
   return (
-