Merge branch 'main' into nvidia-e2e-notebook

2025-07-22 12:37:53 +00:00 · 2025-06-06 11:11:53 -04:00 · 2025-06-06 11:11:53 -04:00 · 1a492ad0cc
commit 1a492ad0cc
parent 6a004e99ed 0d0b8d2be1
200 changed files with 8714 additions and 3175 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@ -30,6 +30,9 @@ from llama_stack.strong_typing.schema import (
    Schema,
    SchemaOptions,
 )
+from typing import get_origin, get_args
+from typing import Annotated
+from fastapi import UploadFile
 from llama_stack.strong_typing.serialization import json_dump_string, object_to_json

 from .operations import (
@ -618,6 +621,45 @@ class Generator:
                },
                required=True,
            )
+        # data passed in request body as multipart/form-data
+        elif op.multipart_params:
+            builder = ContentBuilder(self.schema_builder)
+            
+            # Create schema properties for multipart form fields
+            properties = {}
+            required_fields = []
+            
+            for name, param_type in op.multipart_params:
+                if get_origin(param_type) is Annotated:
+                    base_type = get_args(param_type)[0]
+                else:
+                    base_type = param_type
+                if base_type is UploadFile:
+                    # File upload
+                    properties[name] = {
+                        "type": "string",
+                        "format": "binary"
+                    }
+                else:
+                    # Form field
+                    properties[name] = self.schema_builder.classdef_to_ref(base_type)
+                
+                required_fields.append(name)
+            
+            multipart_schema = {
+                "type": "object",
+                "properties": properties,
+                "required": required_fields
+            }
+            
+            requestBody = RequestBody(
+                content={
+                    "multipart/form-data": {
+                        "schema": multipart_schema
+                    }
+                },
+                required=True,
+            )
        # data passed in payload as JSON and mapped to request parameters
        elif op.request_params:
            builder = ContentBuilder(self.schema_builder)
--- a/docs/openapi_generator/pyopenapi/operations.py
+++ b/docs/openapi_generator/pyopenapi/operations.py
@ -17,6 +17,12 @@ from termcolor import colored

 from llama_stack.strong_typing.inspection import get_signature

+from typing import get_origin, get_args
+
+from fastapi import UploadFile 
+from fastapi.params import File, Form
+from typing import Annotated
+

 def split_prefix(
    s: str, sep: str, prefix: Union[str, Iterable[str]]
@ -82,6 +88,7 @@ class EndpointOperation:
    :param path_params: Parameters of the operation signature that are passed in the path component of the URL string.
    :param query_params: Parameters of the operation signature that are passed in the query string as `key=value` pairs.
    :param request_params: The parameter that corresponds to the data transmitted in the request body.
+    :param multipart_params: Parameters of the operation signature that are passed in the request body as multipart/form-data fields (file uploads and form fields).
    :param event_type: The Python type of the data that is transmitted out-of-band (e.g. via websockets) while the operation is in progress.
    :param response_type: The Python type of the data that is transmitted in the response body.
    :param http_method: The HTTP method used to invoke the endpoint such as POST, GET or PUT.
@ -98,6 +105,7 @@ class EndpointOperation:
    path_params: List[OperationParameter]
    query_params: List[OperationParameter]
    request_params: Optional[OperationParameter]
+    multipart_params: List[OperationParameter]
    event_type: Optional[type]
    response_type: type
    http_method: HTTPMethod
@ -252,6 +260,7 @@ def get_endpoint_operations(
        path_params = []
        query_params = []
        request_params = []
+        multipart_params = []

        for param_name, parameter in signature.parameters.items():
            param_type = _get_annotation_type(parameter.annotation, func_ref)
@ -266,6 +275,8 @@ def get_endpoint_operations(
                    f"parameter '{param_name}' in function '{func_name}' has no type annotation"
                )

+            is_multipart = _is_multipart_param(param_type)
+            
            if prefix in ["get", "delete"]:
                if route_params is not None and param_name in route_params:
                    path_params.append((param_name, param_type))
@ -274,6 +285,8 @@ def get_endpoint_operations(
            else:
                if route_params is not None and param_name in route_params:
                    path_params.append((param_name, param_type))
+                elif is_multipart:
+                    multipart_params.append((param_name, param_type))
                else:
                    request_params.append((param_name, param_type))

@ -333,6 +346,7 @@ def get_endpoint_operations(
                path_params=path_params,
                query_params=query_params,
                request_params=request_params,
+                multipart_params=multipart_params,
                event_type=event_type,
                response_type=response_type,
                http_method=http_method,
@ -377,3 +391,34 @@ def get_endpoint_events(endpoint: type) -> Dict[str, type]:
        results[param_type.__name__] = param_type

    return results
+
+
+def _is_multipart_param(param_type: type) -> bool:
+    """
+    Check whether a parameter type marks the parameter as multipart/form-data.
+
+    A type qualifies when it is exactly `UploadFile`, or when it is an
+    `Annotated[...]` type whose metadata contains a `File` or `Form` instance,
+    e.g. `Annotated[UploadFile, File()]` or `Annotated[str, Form()]`.
+
+    :param param_type: The parameter's type annotation.
+    :returns: True if the type indicates multipart form data, False otherwise.
+    """
+    if param_type is UploadFile:
+        return True
+    
+    # get_origin() is None for plain (non-generic) types, which cannot carry File()/Form() metadata
+    origin = get_origin(param_type)
+    if origin is None:
+        return False
+    
+    if origin is Annotated:
+        args = get_args(param_type)
+        if len(args) < 2:
+            return False
+        
+        # args[0] is the base type; args[1:] are the Annotated metadata entries to scan for File()/Form()
+        for annotation in args[1:]:
+            if isinstance(annotation, (File, Form)):
+                return True
+    return False
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@ -153,6 +153,12 @@ def _validate_api_delete_method_returns_none(method) -> str | None:
        return "has no return type annotation"

    return_type = hints['return']
+    
+    # Allow OpenAI endpoints to return response objects since they follow OpenAI specification
+    method_name = getattr(method, '__name__', '')
+    if method_name.startswith('openai_'):
+        return None
+    
    if return_type is not None and return_type is not type(None):
        return "does not return None where None is mandatory"

--- a/docs/source/building_applications/rag.md
+++ b/docs/source/building_applications/rag.md
@ -57,6 +57,31 @@ chunks = [
 ]
 client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
 ```
+
+#### Using Precomputed Embeddings
+If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by
+including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you
+want to customize the ingestion process.
+```python
+chunks_with_embeddings = [
+    {
+        "content": "First chunk of text",
+        "mime_type": "text/plain",
+        "embedding": [0.1, 0.2, 0.3, ...],  # Your precomputed embedding vector
+        "metadata": {"document_id": "doc1", "section": "introduction"},
+    },
+    {
+        "content": "Second chunk of text",
+        "mime_type": "text/plain",
+        "embedding": [0.2, 0.3, 0.4, ...],  # Your precomputed embedding vector
+        "metadata": {"document_id": "doc1", "section": "methodology"},
+    },
+]
+client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
+```
+When providing precomputed embeddings, ensure the embedding dimension matches the embedding_dimension specified when
+registering the vector database.
+
 ### Retrieval
 You can query the vector database to retrieve documents based on their embeddings.
 ```python
--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@ -9,29 +9,24 @@ When instantiating an agent, you can provide it a list of tool groups that it ha

 Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools.

-## Types of Tool Group providers
+## Server-side vs. client-side tool execution

-There are three types of providers for tool groups that are supported by Llama Stack.
+Llama Stack allows you to use both server-side and client-side tools. With server-side tools, `agent.create_turn` executes the tool calls emitted by the model
+transparently, giving the user the final answer directly. If client-side tools are provided, the tool call is sent back to the user for execution
+and optional continuation using the `agent.resume_turn` method.

-1. Built-in providers
-2. Model Context Protocol (MCP) providers
-3. Client provided tools

-### Built-in providers
+### Server-side tools

-Built-in providers come packaged with Llama Stack. These providers provide common functionalities like web search, code interpretation, and computational capabilities.
+Llama Stack provides built-in providers for some common tools. These include web search, math, and RAG capabilities.

-#### Web Search providers
-There are three web search providers that are supported by Llama Stack.
+#### Web Search

-1. Brave Search
-2. Bing Search
-3. Tavily Search
+You have three providers to execute the web search tool calls generated by a model: Brave Search, Bing Search, and Tavily Search.

-Example client SDK call to register a "websearch" toolgroup that is provided by brave-search.
+To indicate that the web search tool calls should be executed by brave-search, you can point the "builtin::websearch" toolgroup to the "brave-search" provider.

 ```python
-# Register Brave Search tool group
 client.toolgroups.register(
    toolgroup_id="builtin::websearch",
    provider_id="brave-search",
@ -39,17 +34,17 @@ client.toolgroups.register(
 )
 ```

-The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is `{"<provider_name>_api_key": <your api key>}`.
-
-> **NOTE:** When using Tavily Search and Bing Search, the inference output will still display "Brave Search." This is because Llama models have been trained with Brave Search as a built-in tool. Tavily and bing is just being used in lieu of Brave search.
+The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is:
+```
+{"<provider_name>_api_key": <your api key>}
+```


-#### WolframAlpha
+#### Math

 The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.

 ```python
-# Register WolframAlpha tool group
 client.toolgroups.register(
    toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha"
 )
@ -83,11 +78,49 @@ Features:

 > **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.

-## Model Context Protocol (MCP) Tools
+## Model Context Protocol (MCP)

-MCP tools are special tools that can interact with llama stack over model context protocol. These tools are dynamically discovered from an MCP endpoint and can be used to extend the agent's capabilities.
+[MCP](https://github.com/modelcontextprotocol) is an emerging, popular standard for tool discovery and execution. It is a protocol that allows tools to be dynamically discovered
+from an MCP endpoint and used to extend the agent's capabilities.

-Refer to [https://github.com/modelcontextprotocol/servers](https://github.com/modelcontextprotocol/servers) for available MCP servers.
+
+### Using Remote MCP Servers
+
+You can find some popular remote MCP servers [here](https://github.com/jaw9c/awesome-remote-mcp-servers). You can register them as toolgroups in the same way as local providers.
+
+```python
+client.toolgroups.register(
+    toolgroup_id="mcp::deepwiki",
+    provider_id="model-context-protocol",
+    mcp_endpoint=URL(uri="https://mcp.deepwiki.com/sse"),
+)
+```
+
+Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server
+using the "Provider Data" abstraction provided by Llama Stack. For example, when making an agent call:
+
+```python
+agent = Agent(
+    ...,
+    tools=["mcp::deepwiki"],
+    extra_headers={
+        "X-LlamaStack-Provider-Data": json.dumps(
+            {
+                "mcp_headers": {
+                    "https://mcp.deepwiki.com/sse": {
+                        "Authorization": "Bearer <your_access_token>",
+                    },
+                },
+            }
+        ),
+    },
+)
+agent.create_turn(...)
+```
+
+### Running your own MCP server
+
+Here's an example of how to run a simple MCP server that exposes a File System as a set of tools to the Llama Stack agent.

 ```shell
 # start your MCP server
@ -106,13 +139,9 @@ client.toolgroups.register(
 )
 ```

-MCP tools require:
- A valid MCP endpoint URL
- The endpoint must implement the Model Context Protocol
- Tools are discovered dynamically from the endpoint


-## Adding Custom Tools
+## Adding Custom (Client-side) Tools

 When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed
 along to the generative model.
--- a/docs/source/concepts/api_providers.md
+++ b/docs/source/concepts/api_providers.md
@ -0,0 +1,12 @@
+## API Providers
+
+The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
+- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
+- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.),
+- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)
+
+Providers come in two flavors:
+- **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code.
+- **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full-fledged implementation within Llama Stack.
+
+Most importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.
--- a/docs/source/concepts/apis.md
+++ b/docs/source/concepts/apis.md
@ -0,0 +1,18 @@
+## APIs
+
+A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs:
+
+- **Inference**: run inference with a LLM
+- **Safety**: apply safety policies to the output at a Systems (not only model) level
+- **Agents**: run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc.
+- **DatasetIO**: interface with datasets and data loaders
+- **Scoring**: evaluate outputs of the system
+- **Eval**: generate outputs (via Inference or Agents) and perform scoring
+- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
+- **Telemetry**: collect telemetry data from the system
+
+We are working on adding a few more APIs to complete the application lifecycle. These will include:
+- **Batch Inference**: run inference on a dataset of inputs
+- **Batch Agents**: run agents on a dataset of inputs
+- **Post Training**: fine-tune a Llama model
+- **Synthetic Data Generation**: generate synthetic data for model development
--- a/docs/source/concepts/distributions.md
+++ b/docs/source/concepts/distributions.md
@ -0,0 +1,9 @@
+## Distributions
+
+While there is a lot of flexibility to mix-and-match providers, often users will work with a specific set of providers (hardware support, contractual obligations, etc.). We therefore need to provide a _convenient shorthand_ for such collections. We call this shorthand a **Llama Stack Distribution** or a **Distro**. One can think of it as specific pre-packaged versions of the Llama Stack. Here are some examples:
+
+**Remotely Hosted Distro**: These are the simplest to consume from a user perspective. You can simply obtain the API key for these providers, point to a URL and have _all_ Llama Stack APIs working out of the box. Currently, [Fireworks](https://fireworks.ai/) and [Together](https://together.xyz/) provide such easy-to-consume Llama Stack distributions.
+
+**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.
+
+**On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/ios_sdk.html) and [Android](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/android_sdk.html)
--- a/docs/source/concepts/evaluation_concepts.md
+++ b/docs/source/concepts/evaluation_concepts.md
@ -1,4 +1,4 @@
-# Evaluation Concepts
+## Evaluation Concepts

 The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks.

@ -10,11 +10,7 @@ We introduce a set of APIs in Llama Stack for supporting running evaluations of
 This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).


-## Evaluation Concepts
-
-The Evaluation APIs are associated with a set of Resources as shown in the following diagram. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding.
-
-![Eval Concepts](../references/evals_reference/resources/eval-concept.png)
+The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding.

 - **DatasetIO**: defines interface with datasets and data loaders.
  - Associated with `Dataset` resource.
@ -24,9 +20,9 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
  - Associated with `Benchmark` resource.


-## Open-benchmark Eval
+### Open-benchmark Eval

-### List of open-benchmarks Llama Stack support
+#### List of open-benchmarks Llama Stack supports

 Llama stack pre-registers several popular open-benchmarks to easily evaluate model performance via CLI.

@ -39,7 +35,7 @@ The list of open-benchmarks we currently support:

 You can follow this [contributing guide](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack

-### Run evaluation on open-benchmarks via CLI
+#### Run evaluation on open-benchmarks via CLI

 We have built-in functionality to run the supported open-benchmarks using the llama-stack-client CLI.

@ -74,7 +70,7 @@ evaluation results over there.



-## What's Next?
+#### What's Next?

 - Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
 - Check out our [Building Applications - Evaluation](../building_applications/evals.md) guide for more details on how to use the Evaluation APIs to evaluate your applications.
--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@ -1,74 +1,23 @@
 # Core Concepts

-
-```{toctree}
-:maxdepth: 1
-:hidden:
-
-evaluation_concepts
-```
-
 Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.

-
-## APIs
-
-A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs:
-
- **Inference**: run inference with a LLM
- **Safety**: apply safety policies to the output at a Systems (not only model) level
- **Agents**: run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc.
- **DatasetIO**: interface with datasets and data loaders
- **Scoring**: evaluate outputs of the system
- **Eval**: generate outputs (via Inference or Agents) and perform scoring
- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
- **Telemetry**: collect telemetry data from the system
-
-We are working on adding a few more APIs to complete the application lifecycle. These will include:
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Post Training**: fine-tune a Llama model
- **Synthetic Data Generation**: generate synthetic data for model development
-
-## API Providers
-
-The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.),
- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)
-
-Providers come in two flavors:
- **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code.
- **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack.
-
-Most importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.
-## Resources
-
-Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources:
-
- **Inference**, **Eval** and **Post Training** are associated with `Model` resources.
- **Safety** is associated with `Shield` resources.
- **Tool Runtime** is associated with `ToolGroup` resources.
- **DatasetIO** is associated with `Dataset` resources.
- **VectorIO** is associated with `VectorDB` resources.
- **Scoring** is associated with `ScoringFunction` resources.
- **Eval** is associated with `Model` and `Benchmark` resources.
-
-Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
-
-```{admonition} Registering Resources
-:class: tip
-
-Given this architecture, it is necessary for the Stack to know which provider to use for a given resource. This means you need to explicitly _register_ resources (including models) before you can use them with the associated APIs.
+```{include} apis.md
+:start-after: ## APIs
 ```

-## Distributions
+```{include} api_providers.md
+:start-after: ## API Providers
+```

-While there is a lot of flexibility to mix-and-match providers, often users will work with a specific set of providers (hardware support, contractual obligations, etc.) We therefore need to provide a _convenient shorthand_ for such collections. We call this shorthand a **Llama Stack Distribution** or a **Distro**. One can think of it as specific pre-packaged versions of the Llama Stack. Here are some examples:
+```{include} resources.md
+:start-after: ## Resources
+```

-**Remotely Hosted Distro**: These are the simplest to consume from a user perspective. You can simply obtain the API key for these providers, point to a URL and have _all_ Llama Stack APIs working out of the box. Currently, [Fireworks](https://fireworks.ai/) and [Together](https://together.xyz/) provide such easy-to-consume Llama Stack distributions.
+```{include} distributions.md
+:start-after: ## Distributions
+```

-**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.
-
-
-**On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/ios_sdk.html) and [Android](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/android_sdk.html)
+```{include} evaluation_concepts.md
+:start-after: ## Evaluation Concepts
+```
--- a/docs/source/concepts/resources.md
+++ b/docs/source/concepts/resources.md
@ -0,0 +1,19 @@
+## Resources
+
+Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources:
+
+- **Inference**, **Eval** and **Post Training** are associated with `Model` resources.
+- **Safety** is associated with `Shield` resources.
+- **Tool Runtime** is associated with `ToolGroup` resources.
+- **DatasetIO** is associated with `Dataset` resources.
+- **VectorIO** is associated with `VectorDB` resources.
+- **Scoring** is associated with `ScoringFunction` resources.
+- **Eval** is associated with `Model` and `Benchmark` resources.
+
+Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
+
+```{admonition} Registering Resources
+:class: tip
+
+Given this architecture, it is necessary for the Stack to know which provider to use for a given resource. This means you need to explicitly _register_ resources (including models) before you can use them with the associated APIs.
+```
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -260,7 +260,41 @@ Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM pyth
 You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
 ```

-After this step is successful, you should be able to find the built container image and test it with `llama stack run <path/to/run.yaml>`.
+Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system.
+```
+export INFERENCE_MODEL="llama3.2:3b"
+export LLAMA_STACK_PORT=8321
+mkdir -p ~/.llama
+```
+
+After this step is successful, you should be able to find the built container image and test it with the below Docker command:
+
+```
+docker run -d \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  localhost/distribution-ollama:dev \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env OLLAMA_URL=http://host.docker.internal:11434
+```
+
+Here are the docker flags and their uses:
+
+* `-d`: Runs the container in the detached mode as a background process
+
+* `-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT`: Maps the container port to the host port for accessing the server
+
+* `-v ~/.llama:/root/.llama`: Mounts the local .llama directory to persist configurations and data
+
+* `localhost/distribution-ollama:dev`: The name and tag of the container image to run
+
+* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
+
+* `--env INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the model to use for inference
+
+* `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service
+
 :::

 ::::
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+export POSTGRES_USER=${POSTGRES_USER:-llamastack}
+export POSTGRES_DB=${POSTGRES_DB:-llamastack}
+export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
+
+export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+
+set -euo pipefail
+set -x
+
+envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
+envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
+envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
+
+kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+  --dry-run=client -o yaml > stack-configmap.yaml
+
+kubectl apply -f stack-configmap.yaml
+
+envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
+envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
+
+envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
--- a/docs/source/distributions/k8s/chroma-k8s.yaml.template
+++ b/docs/source/distributions/k8s/chroma-k8s.yaml.template
@ -0,0 +1,66 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: chromadb-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 20Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: chromadb
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: chromadb
+  template:
+    metadata:
+      labels:
+        app: chromadb
+    spec:
+      containers:
+      - name: chromadb
+        image: chromadb/chroma:latest
+        ports:
+        - containerPort: 6000
+        env:
+        - name: CHROMA_HOST
+          value: "0.0.0.0"
+        - name: CHROMA_PORT
+          value: "6000"
+        - name: PERSIST_DIRECTORY
+          value: "/chroma/chroma"
+        - name: CHROMA_DB_IMPL
+          value: "duckdb+parquet"
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "250m"
+          limits:
+            memory: "2Gi"
+            cpu: "1000m"
+        volumeMounts:
+        - name: chromadb-storage
+          mountPath: /chroma/chroma
+      volumes:
+      - name: chromadb-storage
+        persistentVolumeClaim:
+          claimName: chromadb-pvc
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: chromadb
+spec:
+  selector:
+    app: chromadb
+  ports:
+  - protocol: TCP
+    port: 6000
+    targetPort: 6000
+  type: ClusterIP
--- a/docs/source/distributions/k8s/ingress-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ingress-k8s.yaml.template
@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-stack-service
+spec:
+  type: LoadBalancer
+  selector:
+    app.kubernetes.io/name: llama-stack
+  ports:
+    - name: llama-stack-api
+      port: 8321
+      targetPort: 8321
+      protocol: TCP
+    - name: llama-stack-ui
+      port: 8322
+      targetPort: 8322
+      protocol: TCP
--- a/docs/source/distributions/k8s/postgres-k8s.yaml.template
+++ b/docs/source/distributions/k8s/postgres-k8s.yaml.template
@ -0,0 +1,66 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: postgres-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 10Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: postgres
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: postgres
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: postgres
+    spec:
+      containers:
+      - name: postgres
+        image: postgres:15
+        env:
+        - name: POSTGRES_DB
+          value: "${POSTGRES_DB}"
+        - name: POSTGRES_USER
+          value: "${POSTGRES_USER}"
+        - name: POSTGRES_PASSWORD
+          value: "${POSTGRES_PASSWORD}"
+        - name: PGDATA
+          value: "/var/lib/postgresql/data/pgdata"
+        ports:
+        - containerPort: 5432
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "250m"
+          limits:
+            memory: "1Gi"
+            cpu: "500m"
+        volumeMounts:
+        - name: postgres-storage
+          mountPath: /var/lib/postgresql/data
+      volumes:
+      - name: postgres-storage
+        persistentVolumeClaim:
+          claimName: postgres-pvc
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres-server
+spec:
+  selector:
+    app.kubernetes.io/name: postgres
+  ports:
+  - protocol: TCP
+    port: 5432
+    targetPort: 5432
+  type: ClusterIP
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@ -0,0 +1,128 @@
+apiVersion: v1
+data:
+  stack_run_config.yaml: |
+    version: '2'
+    image_name: kubernetes-demo
+    apis:
+    - agents
+    - inference
+    - safety
+    - telemetry
+    - tool_runtime
+    - vector_io
+    providers:
+      inference:
+      - provider_id: vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_URL:http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+          api_token: ${env.VLLM_API_TOKEN:fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:true}
+      - provider_id: vllm-safety
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+          api_token: ${env.VLLM_API_TOKEN:fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:true}
+      - provider_id: sentence-transformers
+        provider_type: inline::sentence-transformers
+        config: {}
+      vector_io:
+      - provider_id: ${env.ENABLE_CHROMADB+chromadb}
+        provider_type: remote::chromadb
+        config:
+          url: ${env.CHROMADB_URL:}
+      safety:
+      - provider_id: llama-guard
+        provider_type: inline::llama-guard
+        config:
+          excluded_categories: []
+      agents:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          persistence_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:localhost}
+            port: ${env.POSTGRES_PORT:5432}
+            db: ${env.POSTGRES_DB:llamastack}
+            user: ${env.POSTGRES_USER:llamastack}
+            password: ${env.POSTGRES_PASSWORD:llamastack}
+          responses_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:localhost}
+            port: ${env.POSTGRES_PORT:5432}
+            db: ${env.POSTGRES_DB:llamastack}
+            user: ${env.POSTGRES_USER:llamastack}
+            password: ${env.POSTGRES_PASSWORD:llamastack}
+      telemetry:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          service_name: ${env.OTEL_SERVICE_NAME:}
+          sinks: ${env.TELEMETRY_SINKS:console}
+      tool_runtime:
+      - provider_id: brave-search
+        provider_type: remote::brave-search
+        config:
+          api_key: ${env.BRAVE_SEARCH_API_KEY:}
+          max_results: 3
+      - provider_id: tavily-search
+        provider_type: remote::tavily-search
+        config:
+          api_key: ${env.TAVILY_SEARCH_API_KEY:}
+          max_results: 3
+      - provider_id: rag-runtime
+        provider_type: inline::rag-runtime
+        config: {}
+      - provider_id: model-context-protocol
+        provider_type: remote::model-context-protocol
+        config: {}
+    metadata_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:localhost}
+      port: ${env.POSTGRES_PORT:5432}
+      db: ${env.POSTGRES_DB:llamastack}
+      user: ${env.POSTGRES_USER:llamastack}
+      password: ${env.POSTGRES_PASSWORD:llamastack}
+      table_name: llamastack_kvstore
+    inference_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:localhost}
+      port: ${env.POSTGRES_PORT:5432}
+      db: ${env.POSTGRES_DB:llamastack}
+      user: ${env.POSTGRES_USER:llamastack}
+      password: ${env.POSTGRES_PASSWORD:llamastack}
+    models:
+    - metadata:
+        embedding_dimension: 384
+      model_id: all-MiniLM-L6-v2
+      provider_id: sentence-transformers
+      model_type: embedding
+    - metadata: {}
+      model_id: ${env.INFERENCE_MODEL}
+      provider_id: vllm-inference
+      model_type: llm
+    - metadata: {}
+      model_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
+      provider_id: vllm-safety
+      model_type: llm
+    shields:
+    - shield_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
+    vector_dbs: []
+    datasets: []
+    scoring_fns: []
+    benchmarks: []
+    tool_groups:
+    - toolgroup_id: builtin::websearch
+      provider_id: tavily-search
+    - toolgroup_id: builtin::rag
+      provider_id: rag-runtime
+    server:
+      port: 8321
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: llama-stack-config
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@ -0,0 +1,69 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-stack-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: llama-stack
+      app.kubernetes.io/component: server
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: llama-stack
+        app.kubernetes.io/component: server
+    spec:
+      containers:
+      - name: llama-stack
+        image: llamastack/distribution-remote-vllm:latest
+        imagePullPolicy: Always # since we have specified latest instead of a version
+        env:
+        - name: ENABLE_CHROMADB
+          value: "true"
+        - name: CHROMADB_URL
+          value: http://chromadb.default.svc.cluster.local:6000
+        - name: VLLM_URL
+          value: http://vllm-server.default.svc.cluster.local:8000/v1
+        - name: VLLM_MAX_TOKENS
+          value: "3072"
+        - name: VLLM_SAFETY_URL
+          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
+        - name: POSTGRES_HOST
+          value: postgres-server.default.svc.cluster.local
+        - name: POSTGRES_PORT
+          value: "5432"
+        - name: VLLM_TLS_VERIFY
+          value: "false"
+        - name: INFERENCE_MODEL
+          value: "${INFERENCE_MODEL}"
+        - name: SAFETY_MODEL
+          value: "${SAFETY_MODEL}"
+        - name: TAVILY_SEARCH_API_KEY
+          value: "${TAVILY_SEARCH_API_KEY}"
+        command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+        ports:
+          - containerPort: 8321
+        volumeMounts:
+          - name: llama-storage
+            mountPath: /root/.llama
+          - name: llama-config
+            mountPath: /etc/config
+      volumes:
+      - name: llama-storage
+        persistentVolumeClaim:
+          claimName: llama-pvc
+      - name: llama-config
+        configMap:
+          name: llama-stack-config
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@ -0,0 +1,121 @@
+version: '2'
+image_name: kubernetes-demo
+apis:
+- agents
+- inference
+- safety
+- telemetry
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:http://localhost:8000/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:true}
+  - provider_id: vllm-safety
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:true}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
+  vector_io:
+  - provider_id: ${env.ENABLE_CHROMADB+chromadb}
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:localhost}
+        port: ${env.POSTGRES_PORT:5432}
+        db: ${env.POSTGRES_DB:llamastack}
+        user: ${env.POSTGRES_USER:llamastack}
+        password: ${env.POSTGRES_PASSWORD:llamastack}
+      responses_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:localhost}
+        port: ${env.POSTGRES_PORT:5432}
+        db: ${env.POSTGRES_DB:llamastack}
+        user: ${env.POSTGRES_USER:llamastack}
+        password: ${env.POSTGRES_PASSWORD:llamastack}
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: ${env.OTEL_SERVICE_NAME:}
+      sinks: ${env.TELEMETRY_SINKS:console}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+metadata_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:localhost}
+  port: ${env.POSTGRES_PORT:5432}
+  db: ${env.POSTGRES_DB:llamastack}
+  user: ${env.POSTGRES_USER:llamastack}
+  password: ${env.POSTGRES_PASSWORD:llamastack}
+  table_name: llamastack_kvstore
+inference_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:localhost}
+  port: ${env.POSTGRES_PORT:5432}
+  db: ${env.POSTGRES_DB:llamastack}
+  user: ${env.POSTGRES_USER:llamastack}
+  password: ${env.POSTGRES_PASSWORD:llamastack}
+models:
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  model_type: llm
+- metadata: {}
+  model_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
+  provider_id: vllm-safety
+  model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
+  provider_id: tavily-search
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+server:
+  port: 8321
--- a/docs/source/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ui-k8s.yaml.template
@ -0,0 +1,62 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-stack-ui
+  labels:
+    app.kubernetes.io/name: llama-stack
+    app.kubernetes.io/component: ui
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: llama-stack
+      app.kubernetes.io/component: ui
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: llama-stack
+        app.kubernetes.io/component: ui
+    spec:
+      containers:
+      - name: llama-stack-ui
+        image: node:18-alpine
+        command: ["/bin/sh"]
+        env:
+        - name: LLAMA_STACK_BACKEND_URL
+          value: "http://llama-stack-service:8321"
+        - name: LLAMA_STACK_UI_PORT
+          value: "8322"
+        args:
+          - -c
+          - |
+            # Install git (not included in alpine by default)
+            apk add --no-cache git
+
+            # Clone the repository
+            echo "Cloning repository..."
+            git clone https://github.com/meta-llama/llama-stack.git /app
+
+            # Navigate to the UI directory
+            echo "Navigating to UI directory..."
+            cd /app/llama_stack/ui
+
+            # Check if package.json exists
+            if [ ! -f "package.json" ]; then
+              echo "ERROR: package.json not found in $(pwd)"
+              ls -la
+              exit 1
+            fi
+
+            # Install dependencies with verbose output
+            echo "Installing dependencies..."
+            npm install --verbose
+
+            # Verify next is installed
+            echo "Checking if next is installed..."
+            npx next --version || echo "Next.js not found, checking node_modules..."
+            ls -la node_modules/.bin/ | grep next || echo "No next binary found"
+
+            npm run dev
+        ports:
+        - containerPort: 8322
+        workingDir: /app
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@ -0,0 +1,71 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 50Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm
+        workload-type: inference
+    spec:
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+          - labelSelector:
+              matchExpressions:
+              - key: workload-type
+                operator: In
+                values:
+                - inference
+            topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
+      containers:
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args:
+        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+          - containerPort: 8000
+        volumeMounts:
+          - name: llama-storage
+            mountPath: /root/.cache/huggingface
+      volumes:
+      - name: llama-storage
+        persistentVolumeClaim:
+          claimName: vllm-models
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server
+spec:
+  selector:
+    app.kubernetes.io/name: vllm
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
--- a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
@ -0,0 +1,73 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models-safety
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  storageClassName: gp2
+  resources:
+    requests:
+      storage: 30Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server-safety
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm-safety
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm-safety
+        workload-type: inference
+    spec:
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+          - labelSelector:
+              matchExpressions:
+              - key: workload-type
+                operator: In
+                values:
+                - inference
+            topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
+      containers:
+      - name: vllm-safety
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
+        ]
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+          - containerPort: 8001
+        volumeMounts:
+          - name: llama-storage
+            mountPath: /root/.cache/huggingface
+      volumes:
+      - name: llama-storage
+        persistentVolumeClaim:
+          claimName: vllm-models-safety
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server-safety
+spec:
+  selector:
+    app.kubernetes.io/name: vllm-safety
+  ports:
+  - protocol: TCP
+    port: 8001
+    targetPort: 8001
+  type: ClusterIP
--- a/docs/source/distributions/self_hosted_distro/fireworks.md
+++ b/docs/source/distributions/self_hosted_distro/fireworks.md
@ -18,6 +18,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
+| files | `inline::localfs` |
 | inference | `remote::fireworks`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
--- a/docs/source/getting_started/index.md
+++ b/docs/source/getting_started/index.md
@ -82,7 +82,7 @@ for log in AgentEventLogger().log(response):
 ```
 We will use `uv` to run the script
 ```
-uv run --with llama-stack-client demo_script.py
+uv run --with llama-stack-client,fire,requests demo_script.py
 ```
 And you should see output like below.
 ```
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -103,6 +103,7 @@ getting_started/index
 getting_started/detailed_tutorial
 introduction/index
 concepts/index
+openai/index
 providers/index
 distributions/index
 building_applications/index
--- a/docs/source/openai/index.md
+++ b/docs/source/openai/index.md
@ -0,0 +1,193 @@
+# OpenAI API Compatibility
+
+## Server path
+
+Llama Stack exposes an OpenAI-compatible API endpoint at `/v1/openai/v1`. So, for a Llama Stack server running locally on port `8321`, the full URL of the OpenAI-compatible API endpoint is `http://localhost:8321/v1/openai/v1`.
+
+## Clients
+
+You should be able to use any client that speaks OpenAI APIs with Llama Stack. We regularly test with the official Llama Stack clients as well as OpenAI's official Python client.
+
+### Llama Stack Client
+
+When using the Llama Stack client, set the `base_url` to the root of your Llama Stack server. It will automatically route OpenAI-compatible requests to the right server endpoint for you.
+
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient(base_url="http://localhost:8321")
+```
+
+### OpenAI Client
+
+When using an OpenAI client, set the `base_url` to the `/v1/openai/v1` path on your Llama Stack server.
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
+```
+
+Regardless of the client you choose, the following code examples should all work the same.
+
+## APIs implemented
+
+### Models
+
+Many of the APIs require you to pass in a model parameter. To see the list of models available in your Llama Stack server:
+
+```python
+models = client.models.list()
+```
+
+### Responses
+
+:::{note}
+The Responses API implementation is still in active development. While it is quite usable, parts of the API are not yet implemented. We'd love feedback on any use cases that do not work, to help us prioritize the remaining pieces. Please open an issue in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work.
+:::
+
+#### Simple inference
+
+Request:
+
+```python
+response = client.responses.create(
+    model="meta-llama/Llama-3.2-3B-Instruct",
+    input="Write a haiku about coding."
+)
+
+print(response.output_text)
+```
+Example output:
+
+```text
+Pixels dancing slow
+Syntax whispers secrets sweet
+Code's gentle silence
+```
+
+#### Structured Output
+
+Request:
+
+```python
+response = client.responses.create(
+    model="meta-llama/Llama-3.2-3B-Instruct",
+    input=[
+        {
+            "role": "system",
+            "content": "Extract the participants from the event information.",
+        },
+        {
+            "role": "user",
+            "content": "Alice and Bob are going to a science fair on Friday.",
+        },
+    ],
+    text={
+        "format": {
+            "type": "json_schema",
+            "name": "participants",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "participants": {"type": "array", "items": {"type": "string"}}
+                },
+                "required": ["participants"],
+            },
+        }
+    },
+)
+print(response.output_text)
+```
+
+Example output:
+
+```text
+{ "participants": ["Alice", "Bob"] }
+```
+
+### Chat Completions
+
+#### Simple inference
+
+Request:
+
+```python
+chat_completion = client.chat.completions.create(
+    model="meta-llama/Llama-3.2-3B-Instruct",
+    messages=[{"role": "user", "content": "Write a haiku about coding."}],
+)
+
+print(chat_completion.choices[0].message.content)
+```
+
+Example output:
+
+```text
+Lines of code unfold
+Logic flows like a river
+Code's gentle beauty
+```
+
+#### Structured Output
+
+Request:
+
+```python
+chat_completion = client.chat.completions.create(
+    model="meta-llama/Llama-3.2-3B-Instruct",
+    messages=[
+        {
+            "role": "system",
+            "content": "Extract the participants from the event information.",
+        },
+        {
+            "role": "user",
+            "content": "Alice and Bob are going to a science fair on Friday.",
+        },
+    ],
+    response_format={
+        "type": "json_schema",
+        "json_schema": {
+            "name": "participants",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "participants": {"type": "array", "items": {"type": "string"}}
+                },
+                "required": ["participants"],
+            },
+        },
+    },
+)
+
+print(chat_completion.choices[0].message.content)
+```
+
+Example output:
+
+```text
+{ "participants": ["Alice", "Bob"] }
+```
+
+### Completions
+
+#### Simple inference
+
+Request:
+
+```python
+completion = client.completions.create(
+    model="meta-llama/Llama-3.2-3B-Instruct", prompt="Write a haiku about coding."
+)
+
+print(completion.choices[0].text)
+```
+
+Example output:
+
+```text
+Lines of code unfurl
+Logic whispers in the dark
+Art in hidden form
+```