diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 17cf92341..423bd27e6 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -1045,6 +1045,27 @@
]
}
},
+ "/v1/inference/health": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "A dictionary containing the health status of the inference service.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/HealthResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Inference"
+ ],
+ "description": "Retrieve the health status of the inference service.",
+ "parameters": []
+ }
+ },
"/v1/models/{model_id}": {
"get": {
"responses": {
@@ -5742,6 +5763,27 @@
"type"
]
},
+ "HealthResponse": {
+ "type": "object",
+ "properties": {
+ "health": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "string",
+ "enum": [
+ "OK",
+ "Error",
+ "Not Implemented"
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "health"
+ ],
+ "description": "HealthResponse is a model representing the health status response.\nparam health: A dictionary containing health information."
+ },
"Model": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index f63374406..aef989bdb 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -629,6 +629,21 @@ paths:
required: true
schema:
type: string
+ /v1/inference/health:
+ get:
+ responses:
+ '200':
+ description: >-
+ A dictionary containing the health status of the inference service.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/HealthResponse'
+ tags:
+ - Inference
+ description: >-
+ Retrieve the health status of the inference service.
+ parameters: []
/v1/models/{model_id}:
get:
responses:
@@ -3673,6 +3688,24 @@ components:
additionalProperties: false
required:
- type
+ HealthResponse:
+ type: object
+ properties:
+ health:
+ type: object
+ additionalProperties:
+ type: string
+ enum:
+ - OK
+ - Error
+ - Not Implemented
+ additionalProperties: false
+ required:
+ - health
+ description: >-
+ HealthResponse is a model representing the health status response.
+
+ param health: A dictionary containing health information.
Model:
type: object
properties:
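
A quick way to exercise the new route once a distribution is running is a plain HTTP GET. This sketch is not part of the change; the base URL is a placeholder for whatever host/port your stack listens on.

```python
# Illustrative only: call the new health endpoint on a running llama-stack server.
# The base URL below is a placeholder; adjust it to your deployment.
import requests

resp = requests.get("http://localhost:5000/v1/inference/health")
resp.raise_for_status()
# The body follows the HealthResponse schema above, e.g. {"health": {"status": "OK"}}
print(resp.json())
```
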
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 433ba3274..571ef61b4 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -389,6 +389,30 @@ class EmbeddingsResponse(BaseModel):
embeddings: List[List[float]]
+class HealthStatus(str, Enum):
+ OK = "OK"
+ ERROR = "Error"
+ NOT_IMPLEMENTED = "Not Implemented"
+
+
+@json_schema_type
+class HealthResponse(BaseModel):
+ """
+ HealthResponse is a model representing the health status response.
+
+ param health: A dictionary containing health information.
+ """
+
+ health: Dict[str, Union[HealthStatus, str]]
+
+ @field_validator("health")
+ @classmethod
+ def check_status_present(cls, v):
+ if "status" not in v:
+ raise ValueError("'status' must be present in the health dictionary.")
+ return v
+
+
class ModelStore(Protocol):
def get_model(self, identifier: str) -> Model: ...
@@ -481,3 +505,11 @@ class Inference(Protocol):
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
...
+
+ @webmethod(route="/inference/health", method="GET")
+ async def get_health(self) -> HealthResponse:
+ """Retrieve the health status of the inference service.
+
+ :returns: A dictionary containing the health status of the inference service.
+ """
+ ...
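
As a small usage sketch (not part of the diff), the new model can be constructed directly; the `field_validator` rejects any health dict that lacks a `"status"` key.

```python
# Minimal sketch of HealthResponse behavior; assumes llama-stack is installed.
from pydantic import ValidationError

from llama_stack.apis.inference import HealthResponse, HealthStatus

# A well-formed response: "status" is required by check_status_present.
ok = HealthResponse(health={"status": HealthStatus.OK})
print(ok.health)

# Omitting "status" fails validation.
try:
    HealthResponse(health={"uptime": HealthStatus.OK})
except ValidationError as e:
    print(e)
```
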
diff --git a/llama_stack/apis/inspect/inspect.py b/llama_stack/apis/inspect/inspect.py
index 4a647a2d9..3084cf03e 100644
--- a/llama_stack/apis/inspect/inspect.py
+++ b/llama_stack/apis/inspect/inspect.py
@@ -28,7 +28,6 @@ class RouteInfo(BaseModel):
@json_schema_type
class HealthInfo(BaseModel):
status: str
- # TODO: add a provider level status
@json_schema_type
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index f45975189..79707284c 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -17,6 +17,8 @@ from llama_stack.apis.eval import (
)
from llama_stack.apis.inference import (
EmbeddingsResponse,
+ HealthResponse,
+ HealthStatus,
Inference,
LogProbConfig,
Message,
@@ -210,6 +212,17 @@ class InferenceRouter(Inference):
contents=contents,
)
+ async def get_health(self) -> HealthResponse:
+ health_statuses = {}
+ for provider_id, impl in self.routing_table.impls_by_provider_id.items():
+ try:
+ health_statuses[provider_id] = await impl.get_health()
+ except NotImplementedError:
+ health_statuses[provider_id] = HealthResponse(health={"status": HealthStatus.NOT_IMPLEMENTED})
+ except Exception as e:
+ health_statuses[provider_id] = HealthResponse(health={"status": HealthStatus.ERROR, "message": str(e)})
+ return health_statuses
+
class SafetyRouter(Safety):
def __init__(
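
The router-side aggregation above maps each provider id to its own HealthResponse, downgrading providers that raise NotImplementedError to a "Not Implemented" status. Below is a self-contained sketch of that pattern; the provider ids and stub classes are made up for illustration.

```python
# Standalone sketch of the aggregation pattern used by InferenceRouter.get_health;
# the provider ids and stub providers here are hypothetical.
import asyncio

from llama_stack.apis.inference import HealthResponse, HealthStatus


class ReachableProvider:
    async def get_health(self) -> HealthResponse:
        return HealthResponse(health={"status": HealthStatus.OK})


class UnsupportedProvider:
    async def get_health(self) -> HealthResponse:
        raise NotImplementedError()


async def main() -> None:
    impls_by_provider_id = {"ollama": ReachableProvider(), "bedrock": UnsupportedProvider()}
    health_statuses = {}
    for provider_id, impl in impls_by_provider_id.items():
        try:
            health_statuses[provider_id] = await impl.get_health()
        except NotImplementedError:
            health_statuses[provider_id] = HealthResponse(health={"status": HealthStatus.NOT_IMPLEMENTED})
    for provider_id, status in health_statuses.items():
        print(provider_id, status.health)


asyncio.run(main())
```
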
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index c79f97def..269d17d60 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -23,6 +23,7 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
+ HealthResponse,
Inference,
InterleavedContent,
LogProbConfig,
@@ -428,3 +429,8 @@ class MetaReferenceInferenceImpl(
else:
for x in impl():
yield x
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index 6a83836e6..37531004f 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -9,6 +9,7 @@ from typing import AsyncGenerator, List, Optional, Union
from llama_stack.apis.inference import (
CompletionResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -75,3 +76,8 @@ class SentenceTransformersInferenceImpl(
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
raise ValueError("Sentence transformers don't support chat completion")
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index 5536ea3a5..b5d1beb69 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -23,6 +23,7 @@ from llama_stack.apis.inference import (
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -230,5 +231,10 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
yield chunk
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
+
async def embeddings(self, model_id: str, contents: List[InterleavedContent]) -> EmbeddingsResponse:
raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py
index e896f0597..f14eb49f3 100644
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -17,6 +17,7 @@ from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
EmbeddingsResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -198,3 +199,8 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
response_body = json.loads(response.get("body").read())
embeddings.append(response_body.get("embedding"))
return EmbeddingsResponse(embeddings=embeddings)
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 1ce267e8d..fceff5e77 100644
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -16,6 +16,7 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
EmbeddingsResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -191,3 +192,8 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
contents: List[InterleavedContent],
) -> EmbeddingsResponse:
raise NotImplementedError()
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index 3d306e61f..106ca38e9 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -15,6 +15,7 @@ from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
EmbeddingsResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -140,3 +141,8 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
contents: List[InterleavedContent],
) -> EmbeddingsResponse:
raise NotImplementedError()
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index acf37b248..7119a77b5 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -17,6 +17,7 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
EmbeddingsResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -297,3 +298,8 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings)
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py
index 441b6af5c..6f0e93e28 100644
--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@@ -17,6 +17,7 @@ from llama_stack.apis.inference import (
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
+ HealthResponse,
Inference,
InterleavedContent,
LogProbConfig,
@@ -154,3 +155,8 @@ class GroqInferenceAdapter(Inference, ModelRegistryHelper, NeedsRequestProviderD
'Pass Groq API Key in the header X-LlamaStack-Provider-Data as { "groq_api_key": "" }'
)
return Groq(api_key=provider_data.groq_api_key)
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 0c5b7c454..42ade17b7 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -17,6 +17,7 @@ from llama_stack.apis.inference import (
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
+ HealthResponse,
Inference,
InterleavedContent,
LogProbConfig,
@@ -201,3 +202,8 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
else:
# we pass n=1 to get only one completion
return convert_openai_chat_completion_choice(response.choices[0])
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index f524c0734..e99342435 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -22,6 +22,8 @@ from llama_stack.apis.inference import (
ChatCompletionResponse,
CompletionRequest,
EmbeddingsResponse,
+ HealthResponse,
+ HealthStatus,
Inference,
LogProbConfig,
Message,
@@ -369,6 +371,22 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
return model
+ async def get_health(self) -> HealthResponse:
+ """
+ Performs a health check by initializing the service.
+
+ This method is used by the inference health endpoint (GET /v1/inference/health) to verify
+ that the service is running correctly.
+
+ Returns:
+ HealthResponse: A dictionary containing the health status.
+ """
+ try:
+ await self.initialize()
+ return HealthResponse(health={"status": HealthStatus.OK})
+ except ConnectionError as e:
+ return HealthResponse(health={"status": HealthStatus.ERROR, "message": str(e)})
+
async def convert_message_to_openai_dict_for_ollama(message: Message) -> List[dict]:
async def _convert_content(content) -> dict:
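
Ollama is the one adapter in this change with a real implementation: it reuses `initialize()` as the connectivity probe. A hedged sketch of exercising it directly follows; the constructor argument and the default Ollama URL are assumptions, so check the adapter's config for the exact signature.

```python
# Illustrative only: probe a local Ollama server through the adapter.
# OllamaInferenceAdapter(url=...) and the default URL are assumptions here.
import asyncio

from llama_stack.providers.remote.inference.ollama.ollama import OllamaInferenceAdapter


async def main() -> None:
    adapter = OllamaInferenceAdapter(url="http://localhost:11434")
    response = await adapter.get_health()
    print(response.health)  # {"status": HealthStatus.OK} when the server is reachable


asyncio.run(main())
```
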
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index 1abb17336..f60ea192e 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -125,3 +125,8 @@ class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
contents: List[InterleavedTextMedia],
) -> EmbeddingsResponse:
raise NotImplementedError()
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py
index b906e0dcb..5da55bd2b 100644
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -326,3 +326,8 @@ class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference):
]
return compitable_tool_calls
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 1909e01f8..481768c40 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -18,6 +18,7 @@ from llama_stack.apis.inference import (
ChatCompletionResponse,
CompletionRequest,
EmbeddingsResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -308,3 +309,8 @@ class InferenceEndpointAdapter(_HfAdapter):
self.client = endpoint.async_client
self.model_id = endpoint.repository
self.max_tokens = int(endpoint.raw["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"])
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index 054501da8..84ff8166a 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -16,6 +16,7 @@ from llama_stack.apis.inference import (
ChatCompletionResponse,
CompletionRequest,
EmbeddingsResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -274,3 +275,8 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
)
embeddings = [item.embedding for item in r.data]
return EmbeddingsResponse(embeddings=embeddings)
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index b22284302..094294b7c 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -24,6 +24,7 @@ from llama_stack.apis.inference import (
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
+ HealthResponse,
Inference,
LogProbConfig,
Message,
@@ -375,3 +376,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings)
+
+ async def get_health(
+ self,
+ ) -> HealthResponse:
+ raise NotImplementedError()
diff --git a/llama_stack/providers/tests/inference/test_health.py b/llama_stack/providers/tests/inference/test_health.py
new file mode 100644
index 000000000..7c92e11cb
--- /dev/null
+++ b/llama_stack/providers/tests/inference/test_health.py
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import pytest
+
+from llama_stack.apis.inference import HealthResponse
+
+# How to run this test:
+# pytest -v -s llama_stack/providers/tests/inference/test_health.py
+
+
+class TestHealth:
+ @pytest.mark.asyncio
+ async def test_health(self, inference_stack):
+ inference_impl, _ = inference_stack
+ response = await inference_impl.get_health()
+ for key in response:
+ assert isinstance(response[key], HealthResponse)
+ assert response[key].health["status"] == "OK", response