diff --git a/.github/actions/setup-ollama/action.yml b/.github/actions/setup-ollama/action.yml
index 3dd6c940c..1f6e9818b 100644
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@@ -1,26 +1,9 @@
 name: Setup Ollama
-description: Start Ollama and cache model
-inputs:
-  models:
-    description: Comma-separated list of models to pull
-    default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
+description: Start Ollama
 runs:
   using: "composite"
   steps:
-    - name: Install and start Ollama
+    - name: Start Ollama
       shell: bash
       run: |
-        # the ollama installer also starts the ollama service
-        curl -fsSL https://ollama.com/install.sh | sh
-
-        # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
-        # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
-        # pull them directly.
-        # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
-    - name: Pull requested models
-      if: inputs.models != ''
-      shell: bash
-      run: |
-        for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
-          ollama pull "$model"
-        done
+        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
diff --git a/.github/actions/setup-runner/action.yml b/.github/actions/setup-runner/action.yml
index 6cba4fdc3..cdd438eb2 100644
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@@ -1,12 +1,17 @@
 name: Setup runner
 description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
+inputs:
+  python-version:
+    description: The Python version to use
+    required: false
+    default: "3.10"
 runs:
   using: "composite"
   steps:
     - name: Install uv
       uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
       with:
-        python-version: "3.10"
+        python-version: ${{ inputs.python-version }}
         activate-environment: true
         version: 0.7.6

diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index d78e82c9d..7aa8b5807 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -26,6 +26,7 @@ jobs:
         # TODO: generate matrix list from tests/integration when fixed
         test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
         client-type: [library, http]
+        python-version: ["3.10", "3.11", "3.12"]
       fail-fast: false # we want to run all tests regardless of failure

     steps:
@@ -34,20 +35,22 @@ jobs:

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
+        with:
+          python-version: ${{ matrix.python-version }}

       - name: Setup ollama
         uses: ./.github/actions/setup-ollama

       - name: Build Llama Stack
         run: |
-          llama stack build --template ollama --image-type venv
+          uv run llama stack build --template ollama --image-type venv

       - name: Start Llama Stack server in background
         if: matrix.client-type == 'http'
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
-          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
+          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &

       - name: Wait for Llama Stack server to be ready
         if: matrix.client-type == 'http'
@@ -84,6 +87,7 @@ jobs:
       - name: Run Integration Tests
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          OLLAMA_URL: "http://0.0.0.0:11434"
         run: |
           if [ "${{ matrix.client-type }}" == "library" ]; then
             stack_config="ollama"
@@ -104,13 +108,13 @@ jobs:
       - name: Write ollama logs to file
         if: ${{ always() }}
         run: |
-          sudo journalctl -u ollama.service > ollama.log
+          sudo docker logs ollama > ollama.log

       - name: Upload all logs to artifacts
         if: ${{ always() }}
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
           path: |
             *.log
           retention-days: 1
diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml
index cf53459b9..8268a0085 100644
--- a/.github/workflows/providers-build.yml
+++ b/.github/workflows/providers-build.yml
@@ -10,6 +10,7 @@ on:
       - 'llama_stack/distribution/build.*'
       - 'llama_stack/distribution/*.sh'
       - '.github/workflows/providers-build.yml'
+      - 'llama_stack/templates/**'
   pull_request:
     paths:
       - 'llama_stack/cli/stack/build.py'
@@ -17,6 +18,7 @@ on:
       - 'llama_stack/distribution/build.*'
       - 'llama_stack/distribution/*.sh'
       - '.github/workflows/providers-build.yml'
+      - 'llama_stack/templates/**'

 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 10e3f6cee..0b72f48c8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -72,7 +72,7 @@ source .venv/bin/activate
 ```

 > [!NOTE]
-> You can pin a specific version of Python to use for `uv` by adding a `.python-version` file in the root project directory.
+> You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.11`)
 > Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
 > For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 9c1c3170f..775eb93b3 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -633,90 +633,6 @@ } } }, - "/v1/files": { - "get": { - "responses": { - "200": { - "description": "A ListBucketResponse.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListBucketResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "List all buckets.", - "parameters": [ - { - "name": "bucket", - "in": "query", - "description": "Bucket name (valid chars: a-zA-Z0-9_-).", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "post": { - "responses": { - "200": { - "description": "A FileUploadResponse.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/FileUploadResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "Create a new upload session for a file identified by a bucket and key.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CreateUploadSessionRequest" - } - } - }, - "required": true - } - } - }, "/v1/agents/{agent_id}": { "get": { "responses": { @@ -901,101 +817,6 @@ ] } }, - "/v1/files/{bucket}/{key}": { - "get": { - "responses": { - "200": { - "description": "A FileResponse.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/FileResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "Get a file info identified by a bucket and key.", - "parameters": [ - { - "name": "bucket", - "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-).", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "key", - "in": "path", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "delete": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "Delete a file identified by a bucket and key.", - "parameters": [ - { - "name": "bucket", - "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-).", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "key", - "in": "path", - "description": "Key under which the file is stored (valid 
chars: a-zA-Z0-9_-/.).", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/inference/embeddings": { "post": { "responses": { @@ -1979,108 +1800,6 @@ "parameters": [] } }, - "/v1/files/session:{upload_id}": { - "get": { - "responses": { - "200": { - "description": "A FileUploadResponse.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/FileUploadResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "Returns information about an existsing upload session.", - "parameters": [ - { - "name": "upload_id", - "in": "path", - "description": "ID of the upload session.", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "post": { - "responses": { - "200": { - "description": "A FileResponse or None if the upload is not complete.", - "content": { - "application/json": { - "schema": { - "oneOf": [ - { - "$ref": "#/components/schemas/FileResponse" - }, - { - "type": "null" - } - ] - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "Upload file content to an existing upload session. On the server, request body will have the raw bytes that are uploaded.", - "parameters": [ - { - "name": "upload_id", - "in": "path", - "description": "ID of the upload session.", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/octet-stream": { - "schema": { - "type": "string", - "format": "binary" - } - } - }, - "required": true - } - } - }, "/v1/vector-dbs/{vector_db_id}": { "get": { "responses": { @@ -2877,49 +2596,6 @@ } } }, - "/v1/files/{bucket}": { - "get": { - "responses": { - "200": { - "description": "A ListFileResponse.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListFileResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "List all files in a bucket.", - "parameters": [ - { - "name": "bucket", - "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-).", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/models": { "get": { "responses": { @@ -3607,6 +3283,257 @@ } } }, + "/v1/openai/v1/files/{file_id}": { + "get": { + "responses": { + "200": { + "description": "An OpenAIFileObject containing file information.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIFileObject" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + 
"tags": [ + "Files" + ], + "description": "Returns information about a specific file.", + "parameters": [ + { + "name": "file_id", + "in": "path", + "description": "The ID of the file to use for this request.", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "delete": { + "responses": { + "200": { + "description": "An OpenAIFileDeleteResponse indicating successful deletion.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIFileDeleteResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Files" + ], + "description": "Delete a file.", + "parameters": [ + { + "name": "file_id", + "in": "path", + "description": "The ID of the file to use for this request.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/openai/v1/embeddings": { + "post": { + "responses": { + "200": { + "description": "An OpenAIEmbeddingsResponse containing the embeddings.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIEmbeddingsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "Generate OpenAI-compatible embeddings for the given input using the specified model.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenaiEmbeddingsRequest" + } + } + }, + "required": true + } + } + }, + "/v1/openai/v1/files": { + "get": { + "responses": { + "200": { + "description": "An ListOpenAIFileResponse containing the list of files.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListOpenAIFileResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Files" + ], + "description": "Returns a list of files that belong to the user's organization.", + "parameters": [ + { + "name": "after", + "in": "query", + "description": "A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "limit", + "in": "query", + "description": "A limit on the number of objects to be returned. Limit can range between 1 and 10,000, and the default is 10,000.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "order", + "in": "query", + "description": "Sort order by the `created_at` timestamp of the objects. 
`asc` for ascending order and `desc` for descending order.", + "required": false, + "schema": { + "$ref": "#/components/schemas/Order" + } + }, + { + "name": "purpose", + "in": "query", + "description": "Only return files with the given purpose.", + "required": false, + "schema": { + "$ref": "#/components/schemas/OpenAIFilePurpose" + } + } + ] + }, + "post": { + "responses": { + "200": { + "description": "An OpenAIFileObject representing the uploaded file.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIFileObject" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Files" + ], + "description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.", + "parameters": [], + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "type": "object", + "properties": { + "file": { + "type": "string", + "format": "binary" + }, + "purpose": { + "$ref": "#/components/schemas/OpenAIFilePurpose" + } + }, + "required": [ + "file", + "purpose" + ] + } + } + }, + "required": true + } + } + }, "/v1/openai/v1/models": { "get": { "responses": { @@ -3640,6 +3567,49 @@ "parameters": [] } }, + "/v1/openai/v1/files/{file_id}/content": { + "get": { + "responses": { + "200": { + "description": "The raw file content as a binary response.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Response" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Files" + ], + "description": "Returns the contents of the specified file.", + "parameters": [ + { + "name": "file_id", + "in": "path", + "description": "The ID of the file to use for this request.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/post-training/preference-optimize": { "post": { "responses": { @@ -7271,6 +7241,79 @@ ], "title": "OpenAIResponseOutputMessageWebSearchToolCall" }, + "OpenAIResponseText": { + "type": "object", + "properties": { + "format": { + "type": "object", + "properties": { + "type": { + "oneOf": [ + { + "type": "string", + "const": "text" + }, + { + "type": "string", + "const": "json_schema" + }, + { + "type": "string", + "const": "json_object" + } + ], + "description": "Must be \"text\", \"json_schema\", or \"json_object\" to identify the format type" + }, + "name": { + "type": "string", + "description": "The name of the response format. Only used for json_schema." + }, + "schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema." 
+ }, + "description": { + "type": "string", + "description": "(Optional) A description of the response format. Only used for json_schema." + }, + "strict": { + "type": "boolean", + "description": "(Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema." + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIResponseTextFormat", + "description": "Configuration for Responses API text format." + } + }, + "additionalProperties": false, + "title": "OpenAIResponseText" + }, "CreateOpenaiResponseRequest": { "type": "object", "properties": { @@ -7308,11 +7351,17 @@ "temperature": { "type": "number" }, + "text": { + "$ref": "#/components/schemas/OpenAIResponseText" + }, "tools": { "type": "array", "items": { "$ref": "#/components/schemas/OpenAIResponseInputTool" } + }, + "max_infer_iters": { + "type": "integer" } }, "additionalProperties": false, @@ -7378,6 +7427,9 @@ "temperature": { "type": "number" }, + "text": { + "$ref": "#/components/schemas/OpenAIResponseText" + }, "top_p": { "type": "number" }, @@ -7396,7 +7448,8 @@ "object", "output", "parallel_tool_calls", - "status" + "status", + "text" ], "title": "OpenAIResponseObject" }, @@ -7540,9 +7593,57 @@ { "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated" }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputItemAdded" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputItemDone" + }, { "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta" }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDone" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallInProgress" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallSearching" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallCompleted" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsInProgress" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsFailed" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsCompleted" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallArgumentsDone" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted" + }, { "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" } @@ -7551,7 +7652,23 @@ "propertyName": "type", "mapping": { "response.created": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated", + "response.output_item.added": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputItemAdded", + "response.output_item.done": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputItemDone", "response.output_text.delta": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta", + 
"response.output_text.done": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDone", + "response.function_call_arguments.delta": "#/components/schemas/OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta", + "response.function_call_arguments.done": "#/components/schemas/OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone", + "response.web_search_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallInProgress", + "response.web_search_call.searching": "#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallSearching", + "response.web_search_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallCompleted", + "response.mcp_list_tools.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsInProgress", + "response.mcp_list_tools.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsFailed", + "response.mcp_list_tools.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsCompleted", + "response.mcp_call.arguments.delta": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta", + "response.mcp_call.arguments.done": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallArgumentsDone", + "response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress", + "response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed", + "response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted", "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" } } @@ -7594,6 +7711,314 @@ ], "title": "OpenAIResponseObjectStreamResponseCreated" }, + "OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta": { + "type": "object", + "properties": { + "delta": { + "type": "string" + }, + "item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.function_call_arguments.delta", + "default": "response.function_call_arguments.delta" + } + }, + "additionalProperties": false, + "required": [ + "delta", + "item_id", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta" + }, + "OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone": { + "type": "object", + "properties": { + "arguments": { + "type": "string" + }, + "item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.function_call_arguments.done", + "default": "response.function_call_arguments.done" + } + }, + "additionalProperties": false, + "required": [ + "arguments", + "item_id", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone" + }, + "OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta": { + "type": "object", + "properties": { + "delta": { + "type": "string" + }, + "item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.mcp_call.arguments.delta", + "default": "response.mcp_call.arguments.delta" + } + }, + "additionalProperties": false, + "required": [ + "delta", + "item_id", + 
"output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta" + }, + "OpenAIResponseObjectStreamResponseMcpCallArgumentsDone": { + "type": "object", + "properties": { + "arguments": { + "type": "string" + }, + "item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.mcp_call.arguments.done", + "default": "response.mcp_call.arguments.done" + } + }, + "additionalProperties": false, + "required": [ + "arguments", + "item_id", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseMcpCallArgumentsDone" + }, + "OpenAIResponseObjectStreamResponseMcpCallCompleted": { + "type": "object", + "properties": { + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.mcp_call.completed", + "default": "response.mcp_call.completed" + } + }, + "additionalProperties": false, + "required": [ + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseMcpCallCompleted" + }, + "OpenAIResponseObjectStreamResponseMcpCallFailed": { + "type": "object", + "properties": { + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.mcp_call.failed", + "default": "response.mcp_call.failed" + } + }, + "additionalProperties": false, + "required": [ + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseMcpCallFailed" + }, + "OpenAIResponseObjectStreamResponseMcpCallInProgress": { + "type": "object", + "properties": { + "item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.mcp_call.in_progress", + "default": "response.mcp_call.in_progress" + } + }, + "additionalProperties": false, + "required": [ + "item_id", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseMcpCallInProgress" + }, + "OpenAIResponseObjectStreamResponseMcpListToolsCompleted": { + "type": "object", + "properties": { + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.mcp_list_tools.completed", + "default": "response.mcp_list_tools.completed" + } + }, + "additionalProperties": false, + "required": [ + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseMcpListToolsCompleted" + }, + "OpenAIResponseObjectStreamResponseMcpListToolsFailed": { + "type": "object", + "properties": { + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.mcp_list_tools.failed", + "default": "response.mcp_list_tools.failed" + } + }, + "additionalProperties": false, + "required": [ + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseMcpListToolsFailed" + }, + "OpenAIResponseObjectStreamResponseMcpListToolsInProgress": { + "type": "object", + "properties": { + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.mcp_list_tools.in_progress", + "default": "response.mcp_list_tools.in_progress" + } + }, + "additionalProperties": false, + "required": [ + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseMcpListToolsInProgress" + }, + "OpenAIResponseObjectStreamResponseOutputItemAdded": { + "type": "object", + "properties": { + 
"response_id": { + "type": "string" + }, + "item": { + "$ref": "#/components/schemas/OpenAIResponseOutput" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.output_item.added", + "default": "response.output_item.added" + } + }, + "additionalProperties": false, + "required": [ + "response_id", + "item", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseOutputItemAdded" + }, + "OpenAIResponseObjectStreamResponseOutputItemDone": { + "type": "object", + "properties": { + "response_id": { + "type": "string" + }, + "item": { + "$ref": "#/components/schemas/OpenAIResponseOutput" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.output_item.done", + "default": "response.output_item.done" + } + }, + "additionalProperties": false, + "required": [ + "response_id", + "item", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseOutputItemDone" + }, "OpenAIResponseObjectStreamResponseOutputTextDelta": { "type": "object", "properties": { @@ -7629,64 +8054,121 @@ ], "title": "OpenAIResponseObjectStreamResponseOutputTextDelta" }, - "CreateUploadSessionRequest": { + "OpenAIResponseObjectStreamResponseOutputTextDone": { "type": "object", "properties": { - "bucket": { - "type": "string", - "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)." + "content_index": { + "type": "integer" }, - "key": { - "type": "string", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)." + "text": { + "type": "string" }, - "mime_type": { - "type": "string", - "description": "MIME type of the file." + "item_id": { + "type": "string" }, - "size": { - "type": "integer", - "description": "File size in bytes." + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.output_text.done", + "default": "response.output_text.done" } }, "additionalProperties": false, "required": [ - "bucket", - "key", - "mime_type", - "size" + "content_index", + "text", + "item_id", + "output_index", + "sequence_number", + "type" ], - "title": "CreateUploadSessionRequest" + "title": "OpenAIResponseObjectStreamResponseOutputTextDone" }, - "FileUploadResponse": { + "OpenAIResponseObjectStreamResponseWebSearchCallCompleted": { "type": "object", "properties": { - "id": { + "item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { "type": "string", - "description": "ID of the upload session" - }, - "url": { - "type": "string", - "description": "Upload URL for the file or file parts" - }, - "offset": { - "type": "integer", - "description": "Upload content offset" - }, - "size": { - "type": "integer", - "description": "Upload content size" + "const": "response.web_search_call.completed", + "default": "response.web_search_call.completed" } }, "additionalProperties": false, "required": [ - "id", - "url", - "offset", - "size" + "item_id", + "output_index", + "sequence_number", + "type" ], - "title": "FileUploadResponse", - "description": "Response after initiating a file upload session." 
+ "title": "OpenAIResponseObjectStreamResponseWebSearchCallCompleted" + }, + "OpenAIResponseObjectStreamResponseWebSearchCallInProgress": { + "type": "object", + "properties": { + "item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.web_search_call.in_progress", + "default": "response.web_search_call.in_progress" + } + }, + "additionalProperties": false, + "required": [ + "item_id", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseWebSearchCallInProgress" + }, + "OpenAIResponseObjectStreamResponseWebSearchCallSearching": { + "type": "object", + "properties": { + "item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.web_search_call.searching", + "default": "response.web_search_call.searching" + } + }, + "additionalProperties": false, + "required": [ + "item_id", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseWebSearchCallSearching" }, "EmbeddingsRequest": { "type": "object", @@ -8951,46 +9433,6 @@ "title": "URIDataSource", "description": "A dataset that can be obtained from a URI." }, - "FileResponse": { - "type": "object", - "properties": { - "bucket": { - "type": "string", - "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)" - }, - "key": { - "type": "string", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)" - }, - "mime_type": { - "type": "string", - "description": "MIME type of the file" - }, - "url": { - "type": "string", - "description": "Upload URL for the file contents" - }, - "bytes": { - "type": "integer", - "description": "Size of the file in bytes" - }, - "created_at": { - "type": "integer", - "description": "Timestamp of when the file was created" - } - }, - "additionalProperties": false, - "required": [ - "bucket", - "key", - "mime_type", - "url", - "bytes", - "created_at" - ], - "title": "FileResponse", - "description": "Response representing a file entry." - }, "Model": { "type": "object", "properties": { @@ -10020,7 +10462,8 @@ "type": "object", "properties": { "content": { - "$ref": "#/components/schemas/InterleavedContent" + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the chunk, which can be interleaved text, images, or other types." }, "metadata": { "type": "object", @@ -10045,7 +10488,15 @@ "type": "object" } ] - } + }, + "description": "Metadata associated with the chunk, such as document ID, source, or other relevant information." + }, + "embedding": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Optional embedding for the chunk. If not provided, it will be computed later." } }, "additionalProperties": false, @@ -10053,9 +10504,10 @@ "content", "metadata" ], - "title": "Chunk" + "title": "Chunk", + "description": "A chunk of content that can be inserted into a vector database." }, - "description": "The chunks to insert." + "description": "The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. If `embedding` is not provided, it will be computed later." 
}, "ttl_seconds": { "type": "integer", @@ -10294,37 +10746,6 @@ ], "title": "Job" }, - "BucketResponse": { - "type": "object", - "properties": { - "name": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "name" - ], - "title": "BucketResponse" - }, - "ListBucketResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/BucketResponse" - }, - "description": "List of FileResponse entries" - } - }, - "additionalProperties": false, - "required": [ - "data" - ], - "title": "ListBucketResponse", - "description": "Response representing a list of file entries." - }, "ListBenchmarksResponse": { "type": "object", "properties": { @@ -10442,24 +10863,6 @@ ], "title": "ListDatasetsResponse" }, - "ListFileResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/FileResponse" - }, - "description": "List of FileResponse entries" - } - }, - "additionalProperties": false, - "required": [ - "data" - ], - "title": "ListFileResponse", - "description": "Response representing a list of file entries." - }, "ListModelsResponse": { "type": "object", "properties": { @@ -10571,6 +10974,9 @@ "temperature": { "type": "number" }, + "text": { + "$ref": "#/components/schemas/OpenAIResponseText" + }, "top_p": { "type": "number" }, @@ -10596,6 +11002,7 @@ "output", "parallel_tool_calls", "status", + "text", "input" ], "title": "OpenAIResponseObjectWithInput" @@ -11767,6 +12174,261 @@ "title": "OpenAICompletionChoice", "description": "A choice from an OpenAI-compatible completion response." }, + "OpenAIFileDeleteResponse": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The file identifier that was deleted" + }, + "object": { + "type": "string", + "const": "file", + "default": "file", + "description": "The object type, which is always \"file\"" + }, + "deleted": { + "type": "boolean", + "description": "Whether the file was successfully deleted" + } + }, + "additionalProperties": false, + "required": [ + "id", + "object", + "deleted" + ], + "title": "OpenAIFileDeleteResponse", + "description": "Response for deleting a file in OpenAI Files API." + }, + "OpenaiEmbeddingsRequest": { + "type": "object", + "properties": { + "model": { + "type": "string", + "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + }, + "input": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings." + }, + "encoding_format": { + "type": "string", + "description": "(Optional) The format to return the embeddings in. Can be either \"float\" or \"base64\". Defaults to \"float\"." + }, + "dimensions": { + "type": "integer", + "description": "(Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models." + }, + "user": { + "type": "string", + "description": "(Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse." 
+ } + }, + "additionalProperties": false, + "required": [ + "model", + "input" + ], + "title": "OpenaiEmbeddingsRequest" + }, + "OpenAIEmbeddingData": { + "type": "object", + "properties": { + "object": { + "type": "string", + "const": "embedding", + "default": "embedding", + "description": "The object type, which will be \"embedding\"" + }, + "embedding": { + "oneOf": [ + { + "type": "array", + "items": { + "type": "number" + } + }, + { + "type": "string" + } + ], + "description": "The embedding vector as a list of floats (when encoding_format=\"float\") or as a base64-encoded string (when encoding_format=\"base64\")" + }, + "index": { + "type": "integer", + "description": "The index of the embedding in the input list" + } + }, + "additionalProperties": false, + "required": [ + "object", + "embedding", + "index" + ], + "title": "OpenAIEmbeddingData", + "description": "A single embedding data object from an OpenAI-compatible embeddings response." + }, + "OpenAIEmbeddingUsage": { + "type": "object", + "properties": { + "prompt_tokens": { + "type": "integer", + "description": "The number of tokens in the input" + }, + "total_tokens": { + "type": "integer", + "description": "The total number of tokens used" + } + }, + "additionalProperties": false, + "required": [ + "prompt_tokens", + "total_tokens" + ], + "title": "OpenAIEmbeddingUsage", + "description": "Usage information for an OpenAI-compatible embeddings response." + }, + "OpenAIEmbeddingsResponse": { + "type": "object", + "properties": { + "object": { + "type": "string", + "const": "list", + "default": "list", + "description": "The object type, which will be \"list\"" + }, + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIEmbeddingData" + }, + "description": "List of embedding data objects" + }, + "model": { + "type": "string", + "description": "The model that was used to generate the embeddings" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIEmbeddingUsage", + "description": "Usage information" + } + }, + "additionalProperties": false, + "required": [ + "object", + "data", + "model", + "usage" + ], + "title": "OpenAIEmbeddingsResponse", + "description": "Response from an OpenAI-compatible embeddings request." + }, + "OpenAIFilePurpose": { + "type": "string", + "enum": [ + "assistants" + ], + "title": "OpenAIFilePurpose", + "description": "Valid purpose values for OpenAI Files API." + }, + "ListOpenAIFileResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIFileObject" + }, + "description": "List of file objects" + }, + "has_more": { + "type": "boolean" + }, + "first_id": { + "type": "string" + }, + "last_id": { + "type": "string" + }, + "object": { + "type": "string", + "const": "list", + "default": "list", + "description": "The object type, which is always \"list\"" + } + }, + "additionalProperties": false, + "required": [ + "data", + "has_more", + "first_id", + "last_id", + "object" + ], + "title": "ListOpenAIFileResponse", + "description": "Response for listing files in OpenAI Files API." 
+ }, + "OpenAIFileObject": { + "type": "object", + "properties": { + "object": { + "type": "string", + "const": "file", + "default": "file", + "description": "The object type, which is always \"file\"" + }, + "id": { + "type": "string", + "description": "The file identifier, which can be referenced in the API endpoints" + }, + "bytes": { + "type": "integer", + "description": "The size of the file, in bytes" + }, + "created_at": { + "type": "integer", + "description": "The Unix timestamp (in seconds) for when the file was created" + }, + "expires_at": { + "type": "integer", + "description": "The Unix timestamp (in seconds) for when the file expires" + }, + "filename": { + "type": "string", + "description": "The name of the file" + }, + "purpose": { + "type": "string", + "enum": [ + "assistants" + ], + "description": "The intended purpose of the file" + } + }, + "additionalProperties": false, + "required": [ + "object", + "id", + "bytes", + "created_at", + "expires_at", + "filename", + "purpose" + ], + "title": "OpenAIFileObject", + "description": "OpenAI File object as defined in the OpenAI Files API." + }, "OpenAIModel": { "type": "object", "properties": { @@ -11811,6 +12473,10 @@ ], "title": "OpenAIListModelsResponse" }, + "Response": { + "type": "object", + "title": "Response" + }, "DPOAlignmentConfig": { "type": "object", "properties": { @@ -12285,7 +12951,8 @@ "type": "object", "properties": { "content": { - "$ref": "#/components/schemas/InterleavedContent" + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the chunk, which can be interleaved text, images, or other types." }, "metadata": { "type": "object", @@ -12310,7 +12977,15 @@ "type": "object" } ] - } + }, + "description": "Metadata associated with the chunk, such as document ID, source, or other relevant information." + }, + "embedding": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Optional embedding for the chunk. If not provided, it will be computed later." } }, "additionalProperties": false, @@ -12318,7 +12993,8 @@ "content", "metadata" ], - "title": "Chunk" + "title": "Chunk", + "description": "A chunk of content that can be inserted into a vector database." } }, "scores": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1afe870cf..ef6b5d70a 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -427,64 +427,6 @@ paths: schema: $ref: '#/components/schemas/CreateOpenaiResponseRequest' required: true - /v1/files: - get: - responses: - '200': - description: A ListBucketResponse. - content: - application/json: - schema: - $ref: '#/components/schemas/ListBucketResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: List all buckets. - parameters: - - name: bucket - in: query - description: 'Bucket name (valid chars: a-zA-Z0-9_-).' - required: true - schema: - type: string - post: - responses: - '200': - description: A FileUploadResponse. 
- content: - application/json: - schema: - $ref: '#/components/schemas/FileUploadResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: >- - Create a new upload session for a file identified by a bucket and key. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/CreateUploadSessionRequest' - required: true /v1/agents/{agent_id}: get: responses: @@ -616,75 +558,6 @@ paths: required: true schema: type: string - /v1/files/{bucket}/{key}: - get: - responses: - '200': - description: A FileResponse. - content: - application/json: - schema: - $ref: '#/components/schemas/FileResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: >- - Get a file info identified by a bucket and key. - parameters: - - name: bucket - in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-).' - required: true - schema: - type: string - - name: key - in: path - description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). - required: true - schema: - type: string - delete: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: >- - Delete a file identified by a bucket and key. - parameters: - - name: bucket - in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-).' - required: true - schema: - type: string - - name: key - in: path - description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). - required: true - schema: - type: string /v1/inference/embeddings: post: responses: @@ -1363,76 +1236,6 @@ paths: - PostTraining (Coming Soon) description: Get all training jobs. parameters: [] - /v1/files/session:{upload_id}: - get: - responses: - '200': - description: A FileUploadResponse. - content: - application/json: - schema: - $ref: '#/components/schemas/FileUploadResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: >- - Returns information about an existsing upload session. - parameters: - - name: upload_id - in: path - description: ID of the upload session. - required: true - schema: - type: string - post: - responses: - '200': - description: >- - A FileResponse or None if the upload is not complete. - content: - application/json: - schema: - oneOf: - - $ref: '#/components/schemas/FileResponse' - - type: 'null' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: >- - Upload file content to an existing upload session. 
On the server, request - body will have the raw bytes that are uploaded. - parameters: - - name: upload_id - in: path - description: ID of the upload session. - required: true - schema: - type: string - requestBody: - content: - application/octet-stream: - schema: - type: string - format: binary - required: true /v1/vector-dbs/{vector_db_id}: get: responses: @@ -2005,35 +1808,6 @@ paths: schema: $ref: '#/components/schemas/RegisterDatasetRequest' required: true - /v1/files/{bucket}: - get: - responses: - '200': - description: A ListFileResponse. - content: - application/json: - schema: - $ref: '#/components/schemas/ListFileResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: List all files in a bucket. - parameters: - - name: bucket - in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-).' - required: true - schema: - type: string /v1/models: get: responses: @@ -2520,6 +2294,203 @@ paths: schema: $ref: '#/components/schemas/OpenaiCompletionRequest' required: true + /v1/openai/v1/files/{file_id}: + get: + responses: + '200': + description: >- + An OpenAIFileObject containing file information. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIFileObject' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Files + description: >- + Returns information about a specific file. + parameters: + - name: file_id + in: path + description: >- + The ID of the file to use for this request. + required: true + schema: + type: string + delete: + responses: + '200': + description: >- + An OpenAIFileDeleteResponse indicating successful deletion. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIFileDeleteResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Files + description: Delete a file. + parameters: + - name: file_id + in: path + description: >- + The ID of the file to use for this request. + required: true + schema: + type: string + /v1/openai/v1/embeddings: + post: + responses: + '200': + description: >- + An OpenAIEmbeddingsResponse containing the embeddings. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIEmbeddingsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: >- + Generate OpenAI-compatible embeddings for the given input using the specified + model. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/OpenaiEmbeddingsRequest' + required: true + /v1/openai/v1/files: + get: + responses: + '200': + description: >- + An ListOpenAIFileResponse containing the list of files. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ListOpenAIFileResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Files + description: >- + Returns a list of files that belong to the user's organization. + parameters: + - name: after + in: query + description: >- + A cursor for use in pagination. `after` is an object ID that defines your + place in the list. For instance, if you make a list request and receive + 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo + in order to fetch the next page of the list. + required: false + schema: + type: string + - name: limit + in: query + description: >- + A limit on the number of objects to be returned. Limit can range between + 1 and 10,000, and the default is 10,000. + required: false + schema: + type: integer + - name: order + in: query + description: >- + Sort order by the `created_at` timestamp of the objects. `asc` for ascending + order and `desc` for descending order. + required: false + schema: + $ref: '#/components/schemas/Order' + - name: purpose + in: query + description: >- + Only return files with the given purpose. + required: false + schema: + $ref: '#/components/schemas/OpenAIFilePurpose' + post: + responses: + '200': + description: >- + An OpenAIFileObject representing the uploaded file. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIFileObject' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Files + description: >- + Upload a file that can be used across various endpoints. + + The file upload should be a multipart form request with: + + - file: The File object (not file name) to be uploaded. + + - purpose: The intended purpose of the uploaded file. + parameters: [] + requestBody: + content: + multipart/form-data: + schema: + type: object + properties: + file: + type: string + format: binary + purpose: + $ref: '#/components/schemas/OpenAIFilePurpose' + required: + - file + - purpose + required: true /v1/openai/v1/models: get: responses: @@ -2543,6 +2514,38 @@ paths: - Models description: List models using the OpenAI API. parameters: [] + /v1/openai/v1/files/{file_id}/content: + get: + responses: + '200': + description: >- + The raw file content as a binary response. + content: + application/json: + schema: + $ref: '#/components/schemas/Response' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Files + description: >- + Returns the contents of the specified file. + parameters: + - name: file_id + in: path + description: >- + The ID of the file to use for this request. 
+ required: true + schema: + type: string /v1/post-training/preference-optimize: post: responses: @@ -5115,6 +5118,57 @@ components: - type title: >- OpenAIResponseOutputMessageWebSearchToolCall + OpenAIResponseText: + type: object + properties: + format: + type: object + properties: + type: + oneOf: + - type: string + const: text + - type: string + const: json_schema + - type: string + const: json_object + description: >- + Must be "text", "json_schema", or "json_object" to identify the format + type + name: + type: string + description: >- + The name of the response format. Only used for json_schema. + schema: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The JSON schema the response should conform to. In a Python SDK, this + is often a `pydantic` model. Only used for json_schema. + description: + type: string + description: >- + (Optional) A description of the response format. Only used for json_schema. + strict: + type: boolean + description: >- + (Optional) Whether to strictly enforce the JSON schema. If true, the + response must match the schema exactly. Only used for json_schema. + additionalProperties: false + required: + - type + title: OpenAIResponseTextFormat + description: >- + Configuration for Responses API text format. + additionalProperties: false + title: OpenAIResponseText CreateOpenaiResponseRequest: type: object properties: @@ -5142,10 +5196,14 @@ components: type: boolean temperature: type: number + text: + $ref: '#/components/schemas/OpenAIResponseText' tools: type: array items: $ref: '#/components/schemas/OpenAIResponseInputTool' + max_infer_iters: + type: integer additionalProperties: false required: - input @@ -5191,6 +5249,8 @@ components: type: string temperature: type: number + text: + $ref: '#/components/schemas/OpenAIResponseText' top_p: type: number truncation: @@ -5206,6 +5266,7 @@ components: - output - parallel_tool_calls - status + - text title: OpenAIResponseObject OpenAIResponseOutput: oneOf: @@ -5294,13 +5355,45 @@ components: OpenAIResponseObjectStream: oneOf: - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputItemAdded' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputItemDone' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDone' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallInProgress' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallSearching' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallCompleted' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsInProgress' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsFailed' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsCompleted' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallArgumentsDone' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress' + - $ref: 
'#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted' discriminator: propertyName: type mapping: response.created: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' + response.output_item.added: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputItemAdded' + response.output_item.done: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputItemDone' response.output_text.delta: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta' + response.output_text.done: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDone' + response.function_call_arguments.delta: '#/components/schemas/OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta' + response.function_call_arguments.done: '#/components/schemas/OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone' + response.web_search_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallInProgress' + response.web_search_call.searching: '#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallSearching' + response.web_search_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseWebSearchCallCompleted' + response.mcp_list_tools.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsInProgress' + response.mcp_list_tools.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsFailed' + response.mcp_list_tools.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpListToolsCompleted' + response.mcp_call.arguments.delta: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta' + response.mcp_call.arguments.done: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallArgumentsDone' + response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress' + response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed' + response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted' response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted' "OpenAIResponseObjectStreamResponseCompleted": type: object @@ -5332,6 +5425,246 @@ components: - type title: >- OpenAIResponseObjectStreamResponseCreated + "OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta": + type: object + properties: + delta: + type: string + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.function_call_arguments.delta + default: response.function_call_arguments.delta + additionalProperties: false + required: + - delta + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta + "OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone": + type: object + properties: + arguments: + type: string + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.function_call_arguments.done + default: response.function_call_arguments.done + additionalProperties: false + required: + - arguments + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone + 
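The expanded discriminator mapping above introduces many new streaming event types (output item added/done, output text done, function-call and MCP-call argument deltas, web search and MCP list-tools progress, completion). The sketch below is illustrative only: it assumes a Llama Stack server on localhost:8321 and that the Responses API is served under the OpenAI-compatible `/v1/openai/v1` prefix (an assumption — only the files, embeddings, and models paths appear in this spec), and it uses the `openai` Python SDK to branch on the event `type` strings defined in the mapping.

```python
# Illustrative sketch only. Assumptions: a Llama Stack server on localhost:8321,
# a Responses endpoint under the OpenAI-compatible /v1/openai/v1 prefix, and the
# model below being registered. The event type strings match the discriminator
# mapping in this spec.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

stream = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="What is the capital of France?",
    stream=True,
)

for event in stream:
    if event.type == "response.output_text.delta":
        print(event.delta, end="", flush=True)        # incremental assistant text
    elif event.type == "response.function_call_arguments.delta":
        print(f"\n[tool-call args] {event.delta}")    # partial tool-call arguments
    elif event.type == "response.mcp_call.in_progress":
        print("\n[MCP call started]")
    elif event.type == "response.completed":
        print("\n[response complete]")
```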
"OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta": + type: object + properties: + delta: + type: string + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.mcp_call.arguments.delta + default: response.mcp_call.arguments.delta + additionalProperties: false + required: + - delta + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta + "OpenAIResponseObjectStreamResponseMcpCallArgumentsDone": + type: object + properties: + arguments: + type: string + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.mcp_call.arguments.done + default: response.mcp_call.arguments.done + additionalProperties: false + required: + - arguments + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseMcpCallArgumentsDone + "OpenAIResponseObjectStreamResponseMcpCallCompleted": + type: object + properties: + sequence_number: + type: integer + type: + type: string + const: response.mcp_call.completed + default: response.mcp_call.completed + additionalProperties: false + required: + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseMcpCallCompleted + "OpenAIResponseObjectStreamResponseMcpCallFailed": + type: object + properties: + sequence_number: + type: integer + type: + type: string + const: response.mcp_call.failed + default: response.mcp_call.failed + additionalProperties: false + required: + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseMcpCallFailed + "OpenAIResponseObjectStreamResponseMcpCallInProgress": + type: object + properties: + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.mcp_call.in_progress + default: response.mcp_call.in_progress + additionalProperties: false + required: + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseMcpCallInProgress + "OpenAIResponseObjectStreamResponseMcpListToolsCompleted": + type: object + properties: + sequence_number: + type: integer + type: + type: string + const: response.mcp_list_tools.completed + default: response.mcp_list_tools.completed + additionalProperties: false + required: + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseMcpListToolsCompleted + "OpenAIResponseObjectStreamResponseMcpListToolsFailed": + type: object + properties: + sequence_number: + type: integer + type: + type: string + const: response.mcp_list_tools.failed + default: response.mcp_list_tools.failed + additionalProperties: false + required: + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseMcpListToolsFailed + "OpenAIResponseObjectStreamResponseMcpListToolsInProgress": + type: object + properties: + sequence_number: + type: integer + type: + type: string + const: response.mcp_list_tools.in_progress + default: response.mcp_list_tools.in_progress + additionalProperties: false + required: + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseMcpListToolsInProgress + "OpenAIResponseObjectStreamResponseOutputItemAdded": + type: object + properties: + response_id: + type: string + item: + $ref: '#/components/schemas/OpenAIResponseOutput' + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: 
response.output_item.added + default: response.output_item.added + additionalProperties: false + required: + - response_id + - item + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseOutputItemAdded + "OpenAIResponseObjectStreamResponseOutputItemDone": + type: object + properties: + response_id: + type: string + item: + $ref: '#/components/schemas/OpenAIResponseOutput' + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.output_item.done + default: response.output_item.done + additionalProperties: false + required: + - response_id + - item + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseOutputItemDone "OpenAIResponseObjectStreamResponseOutputTextDelta": type: object properties: @@ -5359,54 +5692,96 @@ components: - type title: >- OpenAIResponseObjectStreamResponseOutputTextDelta - CreateUploadSessionRequest: + "OpenAIResponseObjectStreamResponseOutputTextDone": type: object properties: - bucket: - type: string - description: >- - Bucket under which the file is stored (valid chars: a-zA-Z0-9_-). - key: - type: string - description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). - mime_type: - type: string - description: MIME type of the file. - size: + content_index: type: integer - description: File size in bytes. + text: + type: string + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.output_text.done + default: response.output_text.done additionalProperties: false required: - - bucket - - key - - mime_type - - size - title: CreateUploadSessionRequest - FileUploadResponse: + - content_index + - text + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseOutputTextDone + "OpenAIResponseObjectStreamResponseWebSearchCallCompleted": type: object properties: - id: + item_id: type: string - description: ID of the upload session - url: + output_index: + type: integer + sequence_number: + type: integer + type: type: string - description: Upload URL for the file or file parts - offset: - type: integer - description: Upload content offset - size: - type: integer - description: Upload content size + const: response.web_search_call.completed + default: response.web_search_call.completed additionalProperties: false required: - - id - - url - - offset - - size - title: FileUploadResponse - description: >- - Response after initiating a file upload session. 
+ - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseWebSearchCallCompleted + "OpenAIResponseObjectStreamResponseWebSearchCallInProgress": + type: object + properties: + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.web_search_call.in_progress + default: response.web_search_call.in_progress + additionalProperties: false + required: + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseWebSearchCallInProgress + "OpenAIResponseObjectStreamResponseWebSearchCallSearching": + type: object + properties: + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.web_search_call.searching + default: response.web_search_call.searching + additionalProperties: false + required: + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseWebSearchCallSearching EmbeddingsRequest: type: object properties: @@ -6311,39 +6686,6 @@ components: title: URIDataSource description: >- A dataset that can be obtained from a URI. - FileResponse: - type: object - properties: - bucket: - type: string - description: >- - Bucket under which the file is stored (valid chars: a-zA-Z0-9_-) - key: - type: string - description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.) - mime_type: - type: string - description: MIME type of the file - url: - type: string - description: Upload URL for the file contents - bytes: - type: integer - description: Size of the file in bytes - created_at: - type: integer - description: Timestamp of when the file was created - additionalProperties: false - required: - - bucket - - key - - mime_type - - url - - bytes - - created_at - title: FileResponse - description: Response representing a file entry. Model: type: object properties: @@ -7024,6 +7366,9 @@ components: properties: content: $ref: '#/components/schemas/InterleavedContent' + description: >- + The content of the chunk, which can be interleaved text, images, + or other types. metadata: type: object additionalProperties: @@ -7034,12 +7379,29 @@ components: - type: string - type: array - type: object + description: >- + Metadata associated with the chunk, such as document ID, source, + or other relevant information. + embedding: + type: array + items: + type: number + description: >- + Optional embedding for the chunk. If not provided, it will be computed + later. additionalProperties: false required: - content - metadata title: Chunk - description: The chunks to insert. + description: >- + A chunk of content that can be inserted into a vector database. + description: >- + The chunks to insert. Each `Chunk` should contain content which can be + interleaved text, images, or other types. `metadata`: `dict[str, Any]` + and `embedding`: `List[float]` are optional. If `metadata` is provided, + you configure how Llama Stack formats the chunk during generation. If + `embedding` is not provided, it will be computed later. ttl_seconds: type: integer description: The time to live of the chunks. 
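Because `Chunk` now carries an optional `embedding`, a single insert call can mix precomputed and server-computed embeddings. A minimal sketch, assuming a `LlamaStackClient` pointed at a local server and an already-registered vector DB whose embedding model is 384-dimensional (e.g. all-MiniLM-L6-v2):

```python
# Minimal sketch, assuming a local Llama Stack server and a vector DB registered
# with embedding_dimension=384.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

chunks = [
    {
        # Embedding supplied up front; its length must match the vector DB's
        # embedding_dimension.
        "content": "Llama Stack accepts precomputed embeddings.",
        "metadata": {"document_id": "doc-1"},
        "embedding": [0.01] * 384,
    },
    {
        # No embedding provided: per the schema above, it is computed later
        # by the configured embedding model.
        "content": "Chunks without an embedding are embedded server-side.",
        "metadata": {"document_id": "doc-1"},
    },
]

client.vector_io.insert(vector_db_id="my_documents", chunks=chunks)
```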
@@ -7175,29 +7537,6 @@ components: - job_id - status title: Job - BucketResponse: - type: object - properties: - name: - type: string - additionalProperties: false - required: - - name - title: BucketResponse - ListBucketResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/BucketResponse' - description: List of FileResponse entries - additionalProperties: false - required: - - data - title: ListBucketResponse - description: >- - Response representing a list of file entries. ListBenchmarksResponse: type: object properties: @@ -7287,20 +7626,6 @@ components: required: - data title: ListDatasetsResponse - ListFileResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/FileResponse' - description: List of FileResponse entries - additionalProperties: false - required: - - data - title: ListFileResponse - description: >- - Response representing a list of file entries. ListModelsResponse: type: object properties: @@ -7381,6 +7706,8 @@ components: type: string temperature: type: number + text: + $ref: '#/components/schemas/OpenAIResponseText' top_p: type: number truncation: @@ -7400,6 +7727,7 @@ components: - output - parallel_tool_calls - status + - text - input title: OpenAIResponseObjectWithInput ListProvidersResponse: @@ -8177,6 +8505,220 @@ components: title: OpenAICompletionChoice description: >- A choice from an OpenAI-compatible completion response. + OpenAIFileDeleteResponse: + type: object + properties: + id: + type: string + description: The file identifier that was deleted + object: + type: string + const: file + default: file + description: The object type, which is always "file" + deleted: + type: boolean + description: >- + Whether the file was successfully deleted + additionalProperties: false + required: + - id + - object + - deleted + title: OpenAIFileDeleteResponse + description: >- + Response for deleting a file in OpenAI Files API. + OpenaiEmbeddingsRequest: + type: object + properties: + model: + type: string + description: >- + The identifier of the model to use. The model must be an embedding model + registered with Llama Stack and available via the /models endpoint. + input: + oneOf: + - type: string + - type: array + items: + type: string + description: >- + Input text to embed, encoded as a string or array of strings. To embed + multiple inputs in a single request, pass an array of strings. + encoding_format: + type: string + description: >- + (Optional) The format to return the embeddings in. Can be either "float" + or "base64". Defaults to "float". + dimensions: + type: integer + description: >- + (Optional) The number of dimensions the resulting output embeddings should + have. Only supported in text-embedding-3 and later models. + user: + type: string + description: >- + (Optional) A unique identifier representing your end-user, which can help + OpenAI to monitor and detect abuse. 
+ additionalProperties: false + required: + - model + - input + title: OpenaiEmbeddingsRequest + OpenAIEmbeddingData: + type: object + properties: + object: + type: string + const: embedding + default: embedding + description: >- + The object type, which will be "embedding" + embedding: + oneOf: + - type: array + items: + type: number + - type: string + description: >- + The embedding vector as a list of floats (when encoding_format="float") + or as a base64-encoded string (when encoding_format="base64") + index: + type: integer + description: >- + The index of the embedding in the input list + additionalProperties: false + required: + - object + - embedding + - index + title: OpenAIEmbeddingData + description: >- + A single embedding data object from an OpenAI-compatible embeddings response. + OpenAIEmbeddingUsage: + type: object + properties: + prompt_tokens: + type: integer + description: The number of tokens in the input + total_tokens: + type: integer + description: The total number of tokens used + additionalProperties: false + required: + - prompt_tokens + - total_tokens + title: OpenAIEmbeddingUsage + description: >- + Usage information for an OpenAI-compatible embeddings response. + OpenAIEmbeddingsResponse: + type: object + properties: + object: + type: string + const: list + default: list + description: The object type, which will be "list" + data: + type: array + items: + $ref: '#/components/schemas/OpenAIEmbeddingData' + description: List of embedding data objects + model: + type: string + description: >- + The model that was used to generate the embeddings + usage: + $ref: '#/components/schemas/OpenAIEmbeddingUsage' + description: Usage information + additionalProperties: false + required: + - object + - data + - model + - usage + title: OpenAIEmbeddingsResponse + description: >- + Response from an OpenAI-compatible embeddings request. + OpenAIFilePurpose: + type: string + enum: + - assistants + title: OpenAIFilePurpose + description: >- + Valid purpose values for OpenAI Files API. + ListOpenAIFileResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/OpenAIFileObject' + description: List of file objects + has_more: + type: boolean + first_id: + type: string + last_id: + type: string + object: + type: string + const: list + default: list + description: The object type, which is always "list" + additionalProperties: false + required: + - data + - has_more + - first_id + - last_id + - object + title: ListOpenAIFileResponse + description: >- + Response for listing files in OpenAI Files API. + OpenAIFileObject: + type: object + properties: + object: + type: string + const: file + default: file + description: The object type, which is always "file" + id: + type: string + description: >- + The file identifier, which can be referenced in the API endpoints + bytes: + type: integer + description: The size of the file, in bytes + created_at: + type: integer + description: >- + The Unix timestamp (in seconds) for when the file was created + expires_at: + type: integer + description: >- + The Unix timestamp (in seconds) for when the file expires + filename: + type: string + description: The name of the file + purpose: + type: string + enum: + - assistants + description: The intended purpose of the file + additionalProperties: false + required: + - object + - id + - bytes + - created_at + - expires_at + - filename + - purpose + title: OpenAIFileObject + description: >- + OpenAI File object as defined in the OpenAI Files API. 
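A hedged end-to-end sketch of the new OpenAI-compatible Files and Embeddings endpoints over plain HTTP. The paths, form fields, and response fields come from the schemas above; the base URL, the registered `all-MiniLM-L6-v2` embedding model, and the absence of authentication are assumptions for illustration.

```python
# Illustrative sketch only: field names come from the request/response schemas
# above; the host, port, and model identifier are placeholders.
import requests

BASE = "http://localhost:8321/v1/openai/v1"

# Upload a file (multipart/form-data with "file" and "purpose" fields).
with open("notes.txt", "rb") as fh:
    uploaded = requests.post(
        f"{BASE}/files",
        files={"file": fh},
        data={"purpose": "assistants"},  # only valid value per OpenAIFilePurpose
    ).json()

file_id = uploaded["id"]

# Retrieve metadata, fetch the raw bytes, then delete the file.
meta = requests.get(f"{BASE}/files/{file_id}").json()
raw = requests.get(f"{BASE}/files/{file_id}/content").content
deleted = requests.delete(f"{BASE}/files/{file_id}").json()
assert deleted["deleted"] is True

# Generate OpenAI-compatible embeddings for a batch of inputs.
embeddings = requests.post(
    f"{BASE}/embeddings",
    json={
        "model": "all-MiniLM-L6-v2",
        "input": ["first sentence", "second sentence"],
        "encoding_format": "float",
    },
).json()
print(len(embeddings["data"]), "vectors;",
      embeddings["usage"]["total_tokens"], "tokens")
```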
OpenAIModel: type: object properties: @@ -8209,6 +8751,9 @@ components: required: - data title: OpenAIListModelsResponse + Response: + type: object + title: Response DPOAlignmentConfig: type: object properties: @@ -8537,6 +9082,9 @@ components: properties: content: $ref: '#/components/schemas/InterleavedContent' + description: >- + The content of the chunk, which can be interleaved text, images, + or other types. metadata: type: object additionalProperties: @@ -8547,11 +9095,23 @@ components: - type: string - type: array - type: object + description: >- + Metadata associated with the chunk, such as document ID, source, + or other relevant information. + embedding: + type: array + items: + type: number + description: >- + Optional embedding for the chunk. If not provided, it will be computed + later. additionalProperties: false required: - content - metadata title: Chunk + description: >- + A chunk of content that can be inserted into a vector database. scores: type: array items: diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index 5b7a685c1..e2c73e33c 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -30,6 +30,9 @@ from llama_stack.strong_typing.schema import ( Schema, SchemaOptions, ) +from typing import get_origin, get_args +from typing import Annotated +from fastapi import UploadFile from llama_stack.strong_typing.serialization import json_dump_string, object_to_json from .operations import ( @@ -618,6 +621,45 @@ class Generator: }, required=True, ) + # data passed in request body as multipart/form-data + elif op.multipart_params: + builder = ContentBuilder(self.schema_builder) + + # Create schema properties for multipart form fields + properties = {} + required_fields = [] + + for name, param_type in op.multipart_params: + if get_origin(param_type) is Annotated: + base_type = get_args(param_type)[0] + else: + base_type = param_type + if base_type is UploadFile: + # File upload + properties[name] = { + "type": "string", + "format": "binary" + } + else: + # Form field + properties[name] = self.schema_builder.classdef_to_ref(base_type) + + required_fields.append(name) + + multipart_schema = { + "type": "object", + "properties": properties, + "required": required_fields + } + + requestBody = RequestBody( + content={ + "multipart/form-data": { + "schema": multipart_schema + } + }, + required=True, + ) # data passed in payload as JSON and mapped to request parameters elif op.request_params: builder = ContentBuilder(self.schema_builder) diff --git a/docs/openapi_generator/pyopenapi/operations.py b/docs/openapi_generator/pyopenapi/operations.py index 5c78b9124..045e33848 100644 --- a/docs/openapi_generator/pyopenapi/operations.py +++ b/docs/openapi_generator/pyopenapi/operations.py @@ -17,6 +17,12 @@ from termcolor import colored from llama_stack.strong_typing.inspection import get_signature +from typing import get_origin, get_args + +from fastapi import UploadFile +from fastapi.params import File, Form +from typing import Annotated + def split_prefix( s: str, sep: str, prefix: Union[str, Iterable[str]] @@ -82,6 +88,7 @@ class EndpointOperation: :param path_params: Parameters of the operation signature that are passed in the path component of the URL string. :param query_params: Parameters of the operation signature that are passed in the query string as `key=value` pairs. :param request_params: The parameter that corresponds to the data transmitted in the request body. 
+ :param multipart_params: Parameters that indicate multipart/form-data request body. :param event_type: The Python type of the data that is transmitted out-of-band (e.g. via websockets) while the operation is in progress. :param response_type: The Python type of the data that is transmitted in the response body. :param http_method: The HTTP method used to invoke the endpoint such as POST, GET or PUT. @@ -98,6 +105,7 @@ class EndpointOperation: path_params: List[OperationParameter] query_params: List[OperationParameter] request_params: Optional[OperationParameter] + multipart_params: List[OperationParameter] event_type: Optional[type] response_type: type http_method: HTTPMethod @@ -252,6 +260,7 @@ def get_endpoint_operations( path_params = [] query_params = [] request_params = [] + multipart_params = [] for param_name, parameter in signature.parameters.items(): param_type = _get_annotation_type(parameter.annotation, func_ref) @@ -266,6 +275,8 @@ def get_endpoint_operations( f"parameter '{param_name}' in function '{func_name}' has no type annotation" ) + is_multipart = _is_multipart_param(param_type) + if prefix in ["get", "delete"]: if route_params is not None and param_name in route_params: path_params.append((param_name, param_type)) @@ -274,6 +285,8 @@ def get_endpoint_operations( else: if route_params is not None and param_name in route_params: path_params.append((param_name, param_type)) + elif is_multipart: + multipart_params.append((param_name, param_type)) else: request_params.append((param_name, param_type)) @@ -333,6 +346,7 @@ def get_endpoint_operations( path_params=path_params, query_params=query_params, request_params=request_params, + multipart_params=multipart_params, event_type=event_type, response_type=response_type, http_method=http_method, @@ -377,3 +391,34 @@ def get_endpoint_events(endpoint: type) -> Dict[str, type]: results[param_type.__name__] = param_type return results + + +def _is_multipart_param(param_type: type) -> bool: + """ + Check if a parameter type indicates multipart form data. 
+ + Returns True if the type is: + - UploadFile + - Annotated[UploadFile, File()] + - Annotated[str, Form()] + - Annotated[Any, File()] + - Annotated[Any, Form()] + """ + if param_type is UploadFile: + return True + + # Check for Annotated types + origin = get_origin(param_type) + if origin is None: + return False + + if origin is Annotated: + args = get_args(param_type) + if len(args) < 2: + return False + + # Check the annotations for File() or Form() + for annotation in args[1:]: + if isinstance(annotation, (File, Form)): + return True + return False diff --git a/docs/openapi_generator/pyopenapi/utility.py b/docs/openapi_generator/pyopenapi/utility.py index 12a69050c..7e54c6fbb 100644 --- a/docs/openapi_generator/pyopenapi/utility.py +++ b/docs/openapi_generator/pyopenapi/utility.py @@ -153,6 +153,12 @@ def _validate_api_delete_method_returns_none(method) -> str | None: return "has no return type annotation" return_type = hints['return'] + + # Allow OpenAI endpoints to return response objects since they follow OpenAI specification + method_name = getattr(method, '__name__', '') + if method_name.startswith('openai_'): + return None + if return_type is not None and return_type is not type(None): return "does not return None where None is mandatory" diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md index dbe90a7fc..289c38991 100644 --- a/docs/source/building_applications/rag.md +++ b/docs/source/building_applications/rag.md @@ -57,6 +57,31 @@ chunks = [ ] client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks) ``` + +#### Using Precomputed Embeddings +If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by +including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you +want to customize the ingestion process. +```python +chunks_with_embeddings = [ + { + "content": "First chunk of text", + "mime_type": "text/plain", + "embedding": [0.1, 0.2, 0.3, ...], # Your precomputed embedding vector + "metadata": {"document_id": "doc1", "section": "introduction"}, + }, + { + "content": "Second chunk of text", + "mime_type": "text/plain", + "embedding": [0.2, 0.3, 0.4, ...], # Your precomputed embedding vector + "metadata": {"document_id": "doc1", "section": "methodology"}, + }, +] +client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings) +``` +When providing precomputed embeddings, ensure the embedding dimension matches the embedding_dimension specified when +registering the vector database. + ### Retrieval You can query the vector database to retrieve documents based on their embeddings. ```python diff --git a/docs/source/building_applications/tools.md b/docs/source/building_applications/tools.md index c7af17bfa..b19be888c 100644 --- a/docs/source/building_applications/tools.md +++ b/docs/source/building_applications/tools.md @@ -9,29 +9,24 @@ When instantiating an agent, you can provide it a list of tool groups that it ha Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools. -## Types of Tool Group providers +## Server-side vs. client-side tool execution -There are three types of providers for tool groups that are supported by Llama Stack. +Llama Stack allows you to use both server-side and client-side tools. 
With server-side tools, `agent.create_turn` can perform execution of the tool calls emitted by the model +transparently giving the user the final answer desired. If client-side tools are provided, the tool call is sent back to the user for execution +and optional continuation using the `agent.resume_turn` method. -1. Built-in providers -2. Model Context Protocol (MCP) providers -3. Client provided tools -### Built-in providers +### Server-side tools -Built-in providers come packaged with Llama Stack. These providers provide common functionalities like web search, code interpretation, and computational capabilities. +Llama Stack provides built-in providers for some common tools. These include web search, math, and RAG capabilities. -#### Web Search providers -There are three web search providers that are supported by Llama Stack. +#### Web Search -1. Brave Search -2. Bing Search -3. Tavily Search +You have three providers to execute the web search tool calls generated by a model: Brave Search, Bing Search, and Tavily Search. -Example client SDK call to register a "websearch" toolgroup that is provided by brave-search. +To indicate that the web search tool calls should be executed by brave-search, you can point the "builtin::websearch" toolgroup to the "brave-search" provider. ```python -# Register Brave Search tool group client.toolgroups.register( toolgroup_id="builtin::websearch", provider_id="brave-search", @@ -39,17 +34,17 @@ client.toolgroups.register( ) ``` -The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is `{"_api_key": }`. - -> **NOTE:** When using Tavily Search and Bing Search, the inference output will still display "Brave Search." This is because Llama models have been trained with Brave Search as a built-in tool. Tavily and bing is just being used in lieu of Brave search. +The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is: +``` +{"_api_key": } +``` -#### WolframAlpha +#### Math The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API. ```python -# Register WolframAlpha tool group client.toolgroups.register( toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha" ) @@ -83,11 +78,49 @@ Features: > **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers. -## Model Context Protocol (MCP) Tools +## Model Context Protocol (MCP) -MCP tools are special tools that can interact with llama stack over model context protocol. These tools are dynamically discovered from an MCP endpoint and can be used to extend the agent's capabilities. +[MCP](https://github.com/modelcontextprotocol) is an upcoming, popular standard for tool discovery and execution. It is a protocol that allows tools to be dynamically discovered +from an MCP endpoint and can be used to extend the agent's capabilities. -Refer to [https://github.com/modelcontextprotocol/servers](https://github.com/modelcontextprotocol/servers) for available MCP servers. + +### Using Remote MCP Servers + +You can find some popular remote MCP servers [here](https://github.com/jaw9c/awesome-remote-mcp-servers). You can register them as toolgroups in the same way as local providers. 
+ +```python +client.toolgroups.register( + toolgroup_id="mcp::deepwiki", + provider_id="model-context-protocol", + mcp_endpoint=URL(uri="https://mcp.deepwiki.com/sse"), +) +``` + +Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server +using the "Provider Data" abstraction provided by Llama Stack. When making an agent call, + +```python +agent = Agent( + ..., + tools=["mcp::deepwiki"], + extra_headers={ + "X-LlamaStack-Provider-Data": json.dumps( + { + "mcp_headers": { + "http://mcp.deepwiki.com/sse": { + "Authorization": "Bearer ", + }, + }, + } + ), + }, +) +agent.create_turn(...) +``` + +### Running your own MCP server + +Here's an example of how to run a simple MCP server that exposes a File System as a set of tools to the Llama Stack agent. ```shell # start your MCP server @@ -106,13 +139,9 @@ client.toolgroups.register( ) ``` -MCP tools require: -- A valid MCP endpoint URL -- The endpoint must implement the Model Context Protocol -- Tools are discovered dynamically from the endpoint -## Adding Custom Tools +## Adding Custom (Client-side) Tools When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed along to the generative model. diff --git a/docs/source/concepts/api_providers.md b/docs/source/concepts/api_providers.md new file mode 100644 index 000000000..6e6502c0c --- /dev/null +++ b/docs/source/concepts/api_providers.md @@ -0,0 +1,12 @@ +## API Providers + +The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include: +- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.), +- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.), +- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.) + +Providers come in two flavors: +- **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code. +- **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack. + +Most importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally. diff --git a/docs/source/concepts/apis.md b/docs/source/concepts/apis.md new file mode 100644 index 000000000..38c6a7a73 --- /dev/null +++ b/docs/source/concepts/apis.md @@ -0,0 +1,18 @@ +## APIs + +A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs: + +- **Inference**: run inference with a LLM +- **Safety**: apply safety policies to the output at a Systems (not only model) level +- **Agents**: run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc. 
+- **DatasetIO**: interface with datasets and data loaders +- **Scoring**: evaluate outputs of the system +- **Eval**: generate outputs (via Inference or Agents) and perform scoring +- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents +- **Telemetry**: collect telemetry data from the system + +We are working on adding a few more APIs to complete the application lifecycle. These will include: +- **Batch Inference**: run inference on a dataset of inputs +- **Batch Agents**: run agents on a dataset of inputs +- **Post Training**: fine-tune a Llama model +- **Synthetic Data Generation**: generate synthetic data for model development diff --git a/docs/source/concepts/distributions.md b/docs/source/concepts/distributions.md new file mode 100644 index 000000000..c3be12d93 --- /dev/null +++ b/docs/source/concepts/distributions.md @@ -0,0 +1,9 @@ +## Distributions + +While there is a lot of flexibility to mix-and-match providers, often users will work with a specific set of providers (hardware support, contractual obligations, etc.) We therefore need to provide a _convenient shorthand_ for such collections. We call this shorthand a **Llama Stack Distribution** or a **Distro**. One can think of it as specific pre-packaged versions of the Llama Stack. Here are some examples: + +**Remotely Hosted Distro**: These are the simplest to consume from a user perspective. You can simply obtain the API key for these providers, point to a URL and have _all_ Llama Stack APIs working out of the box. Currently, [Fireworks](https://fireworks.ai/) and [Together](https://together.xyz/) provide such easy-to-consume Llama Stack distributions. + +**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros. + +**On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/ios_sdk.html) and [Android](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/android_sdk.html) diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md index 14390c0a2..3f03d098f 100644 --- a/docs/source/concepts/evaluation_concepts.md +++ b/docs/source/concepts/evaluation_concepts.md @@ -1,4 +1,4 @@ -# Evaluation Concepts +## Evaluation Concepts The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks. @@ -10,11 +10,7 @@ We introduce a set of APIs in Llama Stack for supporting running evaluations of This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). 
-## Evaluation Concepts - -The Evaluation APIs are associated with a set of Resources as shown in the following diagram. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding. - -![Eval Concepts](../references/evals_reference/resources/eval-concept.png) +The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding. - **DatasetIO**: defines interface with datasets and data loaders. - Associated with `Dataset` resource. @@ -24,9 +20,9 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo - Associated with `Benchmark` resource. -## Open-benchmark Eval +### Open-benchmark Eval -### List of open-benchmarks Llama Stack support +#### List of open-benchmarks Llama Stack support Llama stack pre-registers several popular open-benchmarks to easily evaluate model perfomance via CLI. @@ -39,7 +35,7 @@ The list of open-benchmarks we currently support: You can follow this [contributing guide](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack -### Run evaluation on open-benchmarks via CLI +#### Run evaluation on open-benchmarks via CLI We have built-in functionality to run the supported open-benckmarks using llama-stack-client CLI @@ -74,7 +70,7 @@ evaluation results over there. -## What's Next? +#### What's Next? - Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP). - Check out our [Building Applications - Evaluation](../building_applications/evals.md) guide for more details on how to use the Evaluation APIs to evaluate your applications. diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md index a94511a0d..1c31dc232 100644 --- a/docs/source/concepts/index.md +++ b/docs/source/concepts/index.md @@ -1,74 +1,23 @@ # Core Concepts - -```{toctree} -:maxdepth: 1 -:hidden: - -evaluation_concepts -``` - Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks. - -## APIs - -A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs: - -- **Inference**: run inference with a LLM -- **Safety**: apply safety policies to the output at a Systems (not only model) level -- **Agents**: run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc. -- **DatasetIO**: interface with datasets and data loaders -- **Scoring**: evaluate outputs of the system -- **Eval**: generate outputs (via Inference or Agents) and perform scoring -- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents -- **Telemetry**: collect telemetry data from the system - -We are working on adding a few more APIs to complete the application lifecycle. 
These will include: -- **Batch Inference**: run inference on a dataset of inputs -- **Batch Agents**: run agents on a dataset of inputs -- **Post Training**: fine-tune a Llama model -- **Synthetic Data Generation**: generate synthetic data for model development - -## API Providers - -The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include: -- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.), -- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.), -- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.) - -Providers come in two flavors: -- **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code. -- **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack. - -Most importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally. -## Resources - -Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources: - -- **Inference**, **Eval** and **Post Training** are associated with `Model` resources. -- **Safety** is associated with `Shield` resources. -- **Tool Runtime** is associated with `ToolGroup` resources. -- **DatasetIO** is associated with `Dataset` resources. -- **VectorIO** is associated with `VectorDB` resources. -- **Scoring** is associated with `ScoringFunction` resources. -- **Eval** is associated with `Model` and `Benchmark` resources. - -Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack. - -```{admonition} Registering Resources -:class: tip - -Given this architecture, it is necessary for the Stack to know which provider to use for a given resource. This means you need to explicitly _register_ resources (including models) before you can use them with the associated APIs. +```{include} apis.md +:start-after: ## APIs ``` -## Distributions +```{include} api_providers.md +:start-after: ## API Providers +``` -While there is a lot of flexibility to mix-and-match providers, often users will work with a specific set of providers (hardware support, contractual obligations, etc.) We therefore need to provide a _convenient shorthand_ for such collections. We call this shorthand a **Llama Stack Distribution** or a **Distro**. One can think of it as specific pre-packaged versions of the Llama Stack. Here are some examples: +```{include} resources.md +:start-after: ## Resources +``` -**Remotely Hosted Distro**: These are the simplest to consume from a user perspective. You can simply obtain the API key for these providers, point to a URL and have _all_ Llama Stack APIs working out of the box. Currently, [Fireworks](https://fireworks.ai/) and [Together](https://together.xyz/) provide such easy-to-consume Llama Stack distributions. +```{include} distributions.md +:start-after: ## Distributions +``` -**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. 
Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros. - - -**On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/ios_sdk.html) and [Android](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/android_sdk.html) +```{include} evaluation_concepts.md +:start-after: ## Evaluation Concepts +``` diff --git a/docs/source/concepts/resources.md b/docs/source/concepts/resources.md new file mode 100644 index 000000000..0cdc9a227 --- /dev/null +++ b/docs/source/concepts/resources.md @@ -0,0 +1,19 @@ +## Resources + +Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources: + +- **Inference**, **Eval** and **Post Training** are associated with `Model` resources. +- **Safety** is associated with `Shield` resources. +- **Tool Runtime** is associated with `ToolGroup` resources. +- **DatasetIO** is associated with `Dataset` resources. +- **VectorIO** is associated with `VectorDB` resources. +- **Scoring** is associated with `ScoringFunction` resources. +- **Eval** is associated with `Model` and `Benchmark` resources. + +Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack. + +```{admonition} Registering Resources +:class: tip + +Given this architecture, it is necessary for the Stack to know which provider to use for a given resource. This means you need to explicitly _register_ resources (including models) before you can use them with the associated APIs. +``` diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index 0dbabf8aa..521071cc6 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -260,7 +260,41 @@ Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM pyth You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml` ``` -After this step is successful, you should be able to find the built container image and test it with `llama stack run `. +Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system. 
+``` +export INFERENCE_MODEL="llama3.2:3b" +export LLAMA_STACK_PORT=8321 +mkdir -p ~/.llama +``` + +After this step is successful, you should be able to find the built container image and test it with the below Docker command: + +``` +docker run -d \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ + localhost/distribution-ollama:dev \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env OLLAMA_URL=http://host.docker.internal:11434 +``` + +Here are the docker flags and their uses: + +* `-d`: Runs the container in the detached mode as a background process + +* `-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT`: Maps the container port to the host port for accessing the server + +* `-v ~/.llama:/root/.llama`: Mounts the local .llama directory to persist configurations and data + +* `localhost/distribution-ollama:dev`: The name and tag of the container image to run + +* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on + +* `--env INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the model to use for inference + +* `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service + ::: :::: diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh new file mode 100755 index 000000000..7ff7d28eb --- /dev/null +++ b/docs/source/distributions/k8s/apply.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +export POSTGRES_USER=${POSTGRES_USER:-llamastack} +export POSTGRES_DB=${POSTGRES_DB:-llamastack} +export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack} + +export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct} +export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B} + +set -euo pipefail +set -x + +envsubst < ./vllm-k8s.yaml.template | kubectl apply -f - +envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f - +envsubst < ./postgres-k8s.yaml.template | kubectl apply -f - +envsubst < ./chroma-k8s.yaml.template | kubectl apply -f - + +kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \ + --dry-run=client -o yaml > stack-configmap.yaml + +kubectl apply -f stack-configmap.yaml + +envsubst < ./stack-k8s.yaml.template | kubectl apply -f - +envsubst < ./ingress-k8s.yaml.template | kubectl apply -f - + +envsubst < ./ui-k8s.yaml.template | kubectl apply -f - diff --git a/docs/source/distributions/k8s/chroma-k8s.yaml.template b/docs/source/distributions/k8s/chroma-k8s.yaml.template new file mode 100644 index 000000000..a2a5e3be3 --- /dev/null +++ b/docs/source/distributions/k8s/chroma-k8s.yaml.template @@ -0,0 +1,66 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: chromadb-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chromadb +spec: + replicas: 1 + selector: + matchLabels: + app: chromadb + template: + metadata: + labels: + app: chromadb + spec: + containers: + - name: chromadb + image: chromadb/chroma:latest + ports: + - containerPort: 6000 + env: + - name: CHROMA_HOST + value: "0.0.0.0" + - name: CHROMA_PORT + value: "6000" + - name: PERSIST_DIRECTORY + value: "/chroma/chroma" + - name: CHROMA_DB_IMPL + value: "duckdb+parquet" + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "2Gi" + 
cpu: "1000m" + volumeMounts: + - name: chromadb-storage + mountPath: /chroma/chroma + volumes: + - name: chromadb-storage + persistentVolumeClaim: + claimName: chromadb-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: chromadb +spec: + selector: + app: chromadb + ports: + - protocol: TCP + port: 6000 + targetPort: 6000 + type: ClusterIP diff --git a/docs/source/distributions/k8s/ingress-k8s.yaml.template b/docs/source/distributions/k8s/ingress-k8s.yaml.template new file mode 100644 index 000000000..9ebe86b69 --- /dev/null +++ b/docs/source/distributions/k8s/ingress-k8s.yaml.template @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: llama-stack-service +spec: + type: LoadBalancer + selector: + app.kubernetes.io/name: llama-stack + ports: + - name: llama-stack-api + port: 8321 + targetPort: 8321 + protocol: TCP + - name: llama-stack-ui + port: 8322 + targetPort: 8322 + protocol: TCP diff --git a/docs/source/distributions/k8s/postgres-k8s.yaml.template b/docs/source/distributions/k8s/postgres-k8s.yaml.template new file mode 100644 index 000000000..86a765652 --- /dev/null +++ b/docs/source/distributions/k8s/postgres-k8s.yaml.template @@ -0,0 +1,66 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: postgres-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: postgres + template: + metadata: + labels: + app.kubernetes.io/name: postgres + spec: + containers: + - name: postgres + image: postgres:15 + env: + - name: POSTGRES_DB + value: "${POSTGRES_DB}" + - name: POSTGRES_USER + value: "${POSTGRES_USER}" + - name: POSTGRES_PASSWORD + value: "${POSTGRES_PASSWORD}" + - name: PGDATA + value: "/var/lib/postgresql/data/pgdata" + ports: + - containerPort: 5432 + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + volumeMounts: + - name: postgres-storage + mountPath: /var/lib/postgresql/data + volumes: + - name: postgres-storage + persistentVolumeClaim: + claimName: postgres-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres-server +spec: + selector: + app.kubernetes.io/name: postgres + ports: + - protocol: TCP + port: 5432 + targetPort: 5432 + type: ClusterIP diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml new file mode 100644 index 000000000..fa7bacd8f --- /dev/null +++ b/docs/source/distributions/k8s/stack-configmap.yaml @@ -0,0 +1,128 @@ +apiVersion: v1 +data: + stack_run_config.yaml: | + version: '2' + image_name: kubernetes-demo + apis: + - agents + - inference + - safety + - telemetry + - tool_runtime + - vector_io + providers: + inference: + - provider_id: vllm-inference + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:http://localhost:8000/v1} + max_tokens: ${env.VLLM_MAX_TOKENS:4096} + api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} + - provider_id: vllm-safety + provider_type: remote::vllm + config: + url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1} + max_tokens: ${env.VLLM_MAX_TOKENS:4096} + api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: ${env.ENABLE_CHROMADB+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:} 
+ safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + responses_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:} + sinks: ${env.TELEMETRY_SINKS:console} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} + metadata_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + table_name: llamastack_kvstore + inference_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + models: + - metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding + - metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: vllm-inference + model_type: llm + - metadata: {} + model_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B} + provider_id: vllm-safety + model_type: llm + shields: + - shield_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B} + vector_dbs: [] + datasets: [] + scoring_fns: [] + benchmarks: [] + tool_groups: + - toolgroup_id: builtin::websearch + provider_id: tavily-search + - toolgroup_id: builtin::rag + provider_id: rag-runtime + server: + port: 8321 +kind: ConfigMap +metadata: + creationTimestamp: null + name: llama-stack-config diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template new file mode 100644 index 000000000..1cfc63ef5 --- /dev/null +++ b/docs/source/distributions/k8s/stack-k8s.yaml.template @@ -0,0 +1,69 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: llama-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llama-stack-server +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: llama-stack + app.kubernetes.io/component: server + template: + metadata: + labels: + app.kubernetes.io/name: llama-stack + app.kubernetes.io/component: server + spec: + containers: + - name: llama-stack + image: llamastack/distribution-remote-vllm:latest + imagePullPolicy: Always # since we have specified latest instead of a version + env: + - name: ENABLE_CHROMADB + value: "true" + - name: CHROMADB_URL + value: 
http://chromadb.default.svc.cluster.local:6000 + - name: VLLM_URL + value: http://vllm-server.default.svc.cluster.local:8000/v1 + - name: VLLM_MAX_TOKENS + value: "3072" + - name: VLLM_SAFETY_URL + value: http://vllm-server-safety.default.svc.cluster.local:8001/v1 + - name: POSTGRES_HOST + value: postgres-server.default.svc.cluster.local + - name: POSTGRES_PORT + value: "5432" + - name: VLLM_TLS_VERIFY + value: "false" + - name: INFERENCE_MODEL + value: "${INFERENCE_MODEL}" + - name: SAFETY_MODEL + value: "${SAFETY_MODEL}" + - name: TAVILY_SEARCH_API_KEY + value: "${TAVILY_SEARCH_API_KEY}" + command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"] + ports: + - containerPort: 8321 + volumeMounts: + - name: llama-storage + mountPath: /root/.llama + - name: llama-config + mountPath: /etc/config + volumes: + - name: llama-storage + persistentVolumeClaim: + claimName: llama-pvc + - name: llama-config + configMap: + name: llama-stack-config diff --git a/docs/source/distributions/k8s/stack_run_config.yaml b/docs/source/distributions/k8s/stack_run_config.yaml new file mode 100644 index 000000000..8e2773dd1 --- /dev/null +++ b/docs/source/distributions/k8s/stack_run_config.yaml @@ -0,0 +1,121 @@ +version: '2' +image_name: kubernetes-demo +apis: +- agents +- inference +- safety +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: vllm-inference + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:http://localhost:8000/v1} + max_tokens: ${env.VLLM_MAX_TOKENS:4096} + api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} + - provider_id: vllm-safety + provider_type: remote::vllm + config: + url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1} + max_tokens: ${env.VLLM_MAX_TOKENS:4096} + api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: ${env.ENABLE_CHROMADB+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:} + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + responses_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:} + sinks: ${env.TELEMETRY_SINKS:console} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + 
port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + table_name: llamastack_kvstore +inference_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} +models: +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: vllm-inference + model_type: llm +- metadata: {} + model_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B} + provider_id: vllm-safety + model_type: llm +shields: +- shield_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B} +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +server: + port: 8321 diff --git a/docs/source/distributions/k8s/ui-k8s.yaml.template b/docs/source/distributions/k8s/ui-k8s.yaml.template new file mode 100644 index 000000000..ef1bf0c55 --- /dev/null +++ b/docs/source/distributions/k8s/ui-k8s.yaml.template @@ -0,0 +1,62 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llama-stack-ui + labels: + app.kubernetes.io/name: llama-stack + app.kubernetes.io/component: ui +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: llama-stack + app.kubernetes.io/component: ui + template: + metadata: + labels: + app.kubernetes.io/name: llama-stack + app.kubernetes.io/component: ui + spec: + containers: + - name: llama-stack-ui + image: node:18-alpine + command: ["/bin/sh"] + env: + - name: LLAMA_STACK_BACKEND_URL + value: "http://llama-stack-service:8321" + - name: LLAMA_STACK_UI_PORT + value: "8322" + args: + - -c + - | + # Install git (not included in alpine by default) + apk add --no-cache git + + # Clone the repository + echo "Cloning repository..." + git clone https://github.com/meta-llama/llama-stack.git /app + + # Navigate to the UI directory + echo "Navigating to UI directory..." + cd /app/llama_stack/ui + + # Check if package.json exists + if [ ! -f "package.json" ]; then + echo "ERROR: package.json not found in $(pwd)" + ls -la + exit 1 + fi + + # Install dependencies with verbose output + echo "Installing dependencies..." + npm install --verbose + + # Verify next is installed + echo "Checking if next is installed..." + npx next --version || echo "Next.js not found, checking node_modules..." 
+ ls -la node_modules/.bin/ | grep next || echo "No next binary found" + + npm run dev + ports: + - containerPort: 8322 + workingDir: /app diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template new file mode 100644 index 000000000..6256cc7e1 --- /dev/null +++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template @@ -0,0 +1,71 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vllm-models +spec: + accessModes: + - ReadWriteOnce + volumeMode: Filesystem + resources: + requests: + storage: 50Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-server +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: vllm + template: + metadata: + labels: + app.kubernetes.io/name: vllm + workload-type: inference + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: workload-type + operator: In + values: + - inference + topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node + containers: + - name: vllm + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: + - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6" + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + volumeMounts: + - name: llama-storage + mountPath: /root/.cache/huggingface + volumes: + - name: llama-storage + persistentVolumeClaim: + claimName: vllm-models +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-server +spec: + selector: + app.kubernetes.io/name: vllm + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + type: ClusterIP diff --git a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template new file mode 100644 index 000000000..8857e83b6 --- /dev/null +++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template @@ -0,0 +1,73 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vllm-models-safety +spec: + accessModes: + - ReadWriteOnce + volumeMode: Filesystem + storageClassName: gp2 + resources: + requests: + storage: 30Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-server-safety +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: vllm-safety + template: + metadata: + labels: + app.kubernetes.io/name: vllm-safety + workload-type: inference + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: workload-type + operator: In + values: + - inference + topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node + containers: + - name: vllm-safety + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8001 + volumeMounts: + - name: llama-storage + mountPath: /root/.cache/huggingface + volumes: + - name: llama-storage + persistentVolumeClaim: + claimName: vllm-models-safety +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-server-safety +spec: + selector: + app.kubernetes.io/name: vllm-safety + ports: + - protocol: TCP + 
port: 8001 + targetPort: 8001 + type: ClusterIP diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index d36e94748..e09666e13 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -18,6 +18,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | +| files | `inline::localfs` | | inference | `remote::fireworks`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index e084f68b7..ee7cdd4a9 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -82,7 +82,7 @@ for log in AgentEventLogger().log(response): ``` We will use `uv` to run the script ``` -uv run --with llama-stack-client demo_script.py +uv run --with llama-stack-client,fire,requests demo_script.py ``` And you should see output like below. ``` diff --git a/docs/source/index.md b/docs/source/index.md index 0c2d5a015..1df5e8507 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -103,6 +103,7 @@ getting_started/index getting_started/detailed_tutorial introduction/index concepts/index +openai/index providers/index distributions/index building_applications/index diff --git a/docs/source/openai/index.md b/docs/source/openai/index.md new file mode 100644 index 000000000..03a969cc5 --- /dev/null +++ b/docs/source/openai/index.md @@ -0,0 +1,193 @@ +# OpenAI API Compatibility + +## Server path + +Llama Stack exposes an OpenAI-compatible API endpoint at `/v1/openai/v1`. So, for a Llama Stack server running locally on port `8321`, the full url to the OpenAI-compatible API endpoint is `http://localhost:8321/v1/openai/v1`. + +## Clients + +You should be able to use any client that speaks OpenAI APIs with Llama Stack. We regularly test with the official Llama Stack clients as well as OpenAI's official Python client. + +### Llama Stack Client + +When using the Llama Stack client, set the `base_url` to the root of your Llama Stack server. It will automatically route OpenAI-compatible requests to the right server endpoint for you. + +```python +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient(base_url="http://localhost:8321") +``` + +### OpenAI Client + +When using an OpenAI client, set the `base_url` to the `/v1/openai/v1` path on your Llama Stack server. + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none") +``` + +Regardless of the client you choose, the following code examples should all work the same. + +## APIs implemented + +### Models + +Many of the APIs require you to pass in a model parameter. To see the list of models available in your Llama Stack server: + +```python +models = client.models.list() +``` + +### Responses + +:::{note} +The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use-cases you try that do not work to help prioritize the pieces left to implement. 
Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work. +::: + +#### Simple inference + +Request: + +``` +response = client.responses.create( + model="meta-llama/Llama-3.2-3B-Instruct", + input="Write a haiku about coding." +) + +print(response.output_text) +``` +Example output: + +```text +Pixels dancing slow +Syntax whispers secrets sweet +Code's gentle silence +``` + +#### Structured Output + +Request: + +```python +response = client.responses.create( + model="meta-llama/Llama-3.2-3B-Instruct", + input=[ + { + "role": "system", + "content": "Extract the participants from the event information.", + }, + { + "role": "user", + "content": "Alice and Bob are going to a science fair on Friday.", + }, + ], + text={ + "format": { + "type": "json_schema", + "name": "participants", + "schema": { + "type": "object", + "properties": { + "participants": {"type": "array", "items": {"type": "string"}} + }, + "required": ["participants"], + }, + } + }, +) +print(response.output_text) +``` + +Example output: + +```text +{ "participants": ["Alice", "Bob"] } +``` + +### Chat Completions + +#### Simple inference + +Request: + +```python +chat_completion = client.chat.completions.create( + model="meta-llama/Llama-3.2-3B-Instruct", + messages=[{"role": "user", "content": "Write a haiku about coding."}], +) + +print(chat_completion.choices[0].message.content) +``` + +Example output: + +```text +Lines of code unfold +Logic flows like a river +Code's gentle beauty +``` + +#### Structured Output + +Request: + +```python +chat_completion = client.chat.completions.create( + model="meta-llama/Llama-3.2-3B-Instruct", + messages=[ + { + "role": "system", + "content": "Extract the participants from the event information.", + }, + { + "role": "user", + "content": "Alice and Bob are going to a science fair on Friday.", + }, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "participants", + "schema": { + "type": "object", + "properties": { + "participants": {"type": "array", "items": {"type": "string"}} + }, + "required": ["participants"], + }, + }, + }, +) + +print(chat_completion.choices[0].message.content) +``` + +Example output: + +```text +{ "participants": ["Alice", "Bob"] } +``` + +### Completions + +#### Simple inference + +Request: + +```python +completion = client.completions.create( + model="meta-llama/Llama-3.2-3B-Instruct", prompt="Write a haiku about coding." +) + +print(completion.choices[0].text) +``` + +Example output: + +```text +Lines of code unfurl +Logic whispers in the dark +Art in hidden form +``` diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index b79c512b8..cc4ee0648 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -37,6 +37,7 @@ from .openai_responses import ( OpenAIResponseInputTool, OpenAIResponseObject, OpenAIResponseObjectStream, + OpenAIResponseText, ) # TODO: use enum.StrEnum when we drop support for python 3.10 @@ -603,7 +604,9 @@ class Agents(Protocol): store: bool | None = True, stream: bool | None = False, temperature: float | None = None, + text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, + max_infer_iters: int | None = 10, # this is an extension to the OpenAI API ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]: """Create a new OpenAI response. 
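Editor's note: the `create_openai_response` signature above gains a `text` response-format parameter and a Llama Stack specific `max_infer_iters` extension. A hedged sketch of exercising both through the OpenAI client against the `/v1/openai/v1` endpoint documented earlier; passing the extension via `extra_body` is an assumption about how the server accepts extra fields.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="Return three prime numbers as a JSON object.",
    text={"format": {"type": "json_object"}},
    # max_infer_iters is the Llama Stack extension added above; sending it via
    # extra_body is assumed pass-through behaviour.
    extra_body={"max_infer_iters": 5},
)
print(response.output_text)
```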
diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py index 6806e1d3f..35b3d5ace 100644 --- a/llama_stack/apis/agents/openai_responses.py +++ b/llama_stack/apis/agents/openai_responses.py @@ -7,6 +7,7 @@ from typing import Annotated, Any, Literal from pydantic import BaseModel, Field +from typing_extensions import TypedDict from llama_stack.schema_utils import json_schema_type, register_schema @@ -126,6 +127,32 @@ OpenAIResponseOutput = Annotated[ register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput") +# This has to be a TypedDict because we need a "schema" field and our strong +# typing code in the schema generator doesn't support Pydantic aliases. That also +# means we can't use a discriminator field here, because TypedDicts don't support +# default values which the strong typing code requires for discriminators. +class OpenAIResponseTextFormat(TypedDict, total=False): + """Configuration for Responses API text format. + + :param type: Must be "text", "json_schema", or "json_object" to identify the format type + :param name: The name of the response format. Only used for json_schema. + :param schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema. + :param description: (Optional) A description of the response format. Only used for json_schema. + :param strict: (Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema. + """ + + type: Literal["text"] | Literal["json_schema"] | Literal["json_object"] + name: str | None + schema: dict[str, Any] | None + description: str | None + strict: bool | None + + +@json_schema_type +class OpenAIResponseText(BaseModel): + format: OpenAIResponseTextFormat | None = None + + @json_schema_type class OpenAIResponseObject(BaseModel): created_at: int @@ -138,6 +165,9 @@ class OpenAIResponseObject(BaseModel): previous_response_id: str | None = None status: str temperature: float | None = None + # Default to text format to avoid breaking the loading of old responses + # before the field was added. New responses will have this set always. 
+ text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) top_p: float | None = None truncation: str | None = None user: str | None = None @@ -149,6 +179,30 @@ class OpenAIResponseObjectStreamResponseCreated(BaseModel): type: Literal["response.created"] = "response.created" +@json_schema_type +class OpenAIResponseObjectStreamResponseCompleted(BaseModel): + response: OpenAIResponseObject + type: Literal["response.completed"] = "response.completed" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseOutputItemAdded(BaseModel): + response_id: str + item: OpenAIResponseOutput + output_index: int + sequence_number: int + type: Literal["response.output_item.added"] = "response.output_item.added" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseOutputItemDone(BaseModel): + response_id: str + item: OpenAIResponseOutput + output_index: int + sequence_number: int + type: Literal["response.output_item.done"] = "response.output_item.done" + + @json_schema_type class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel): content_index: int @@ -160,14 +214,132 @@ class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel): @json_schema_type -class OpenAIResponseObjectStreamResponseCompleted(BaseModel): - response: OpenAIResponseObject - type: Literal["response.completed"] = "response.completed" +class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel): + content_index: int + text: str # final text of the output item + item_id: str + output_index: int + sequence_number: int + type: Literal["response.output_text.done"] = "response.output_text.done" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta(BaseModel): + delta: str + item_id: str + output_index: int + sequence_number: int + type: Literal["response.function_call_arguments.delta"] = "response.function_call_arguments.delta" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone(BaseModel): + arguments: str # final arguments of the function call + item_id: str + output_index: int + sequence_number: int + type: Literal["response.function_call_arguments.done"] = "response.function_call_arguments.done" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseWebSearchCallInProgress(BaseModel): + item_id: str + output_index: int + sequence_number: int + type: Literal["response.web_search_call.in_progress"] = "response.web_search_call.in_progress" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseWebSearchCallSearching(BaseModel): + item_id: str + output_index: int + sequence_number: int + type: Literal["response.web_search_call.searching"] = "response.web_search_call.searching" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseWebSearchCallCompleted(BaseModel): + item_id: str + output_index: int + sequence_number: int + type: Literal["response.web_search_call.completed"] = "response.web_search_call.completed" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseMcpListToolsInProgress(BaseModel): + sequence_number: int + type: Literal["response.mcp_list_tools.in_progress"] = "response.mcp_list_tools.in_progress" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseMcpListToolsFailed(BaseModel): + sequence_number: int + type: Literal["response.mcp_list_tools.failed"] = "response.mcp_list_tools.failed" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseMcpListToolsCompleted(BaseModel): + sequence_number: int + type: 
Literal["response.mcp_list_tools.completed"] = "response.mcp_list_tools.completed" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta(BaseModel): + delta: str + item_id: str + output_index: int + sequence_number: int + type: Literal["response.mcp_call.arguments.delta"] = "response.mcp_call.arguments.delta" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseMcpCallArgumentsDone(BaseModel): + arguments: str # final arguments of the MCP call + item_id: str + output_index: int + sequence_number: int + type: Literal["response.mcp_call.arguments.done"] = "response.mcp_call.arguments.done" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseMcpCallInProgress(BaseModel): + item_id: str + output_index: int + sequence_number: int + type: Literal["response.mcp_call.in_progress"] = "response.mcp_call.in_progress" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseMcpCallFailed(BaseModel): + sequence_number: int + type: Literal["response.mcp_call.failed"] = "response.mcp_call.failed" + + +@json_schema_type +class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel): + sequence_number: int + type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed" OpenAIResponseObjectStream = Annotated[ OpenAIResponseObjectStreamResponseCreated + | OpenAIResponseObjectStreamResponseOutputItemAdded + | OpenAIResponseObjectStreamResponseOutputItemDone | OpenAIResponseObjectStreamResponseOutputTextDelta + | OpenAIResponseObjectStreamResponseOutputTextDone + | OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta + | OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone + | OpenAIResponseObjectStreamResponseWebSearchCallInProgress + | OpenAIResponseObjectStreamResponseWebSearchCallSearching + | OpenAIResponseObjectStreamResponseWebSearchCallCompleted + | OpenAIResponseObjectStreamResponseMcpListToolsInProgress + | OpenAIResponseObjectStreamResponseMcpListToolsFailed + | OpenAIResponseObjectStreamResponseMcpListToolsCompleted + | OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta + | OpenAIResponseObjectStreamResponseMcpCallArgumentsDone + | OpenAIResponseObjectStreamResponseMcpCallInProgress + | OpenAIResponseObjectStreamResponseMcpCallFailed + | OpenAIResponseObjectStreamResponseMcpCallCompleted | OpenAIResponseObjectStreamResponseCompleted, Field(discriminator="type"), ] diff --git a/llama_stack/apis/files/files.py b/llama_stack/apis/files/files.py index 1d762a68a..4dfeed448 100644 --- a/llama_stack/apis/files/files.py +++ b/llama_stack/apis/files/files.py @@ -4,179 +4,158 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Protocol, runtime_checkable +from enum import Enum +from typing import Annotated, Literal, Protocol, runtime_checkable +from fastapi import File, Form, Response, UploadFile from pydantic import BaseModel +from llama_stack.apis.common.responses import Order from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.schema_utils import json_schema_type, webmethod -@json_schema_type -class FileUploadResponse(BaseModel): +# OpenAI Files API Models +class OpenAIFilePurpose(str, Enum): + """ + Valid purpose values for OpenAI Files API. """ - Response after initiating a file upload session. 
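Editor's note: the `OpenAIResponseObjectStream` union above now carries many more event types. A hedged consumer sketch that dispatches on the `type` discriminator; only fields visibly declared in the models above are touched, and obtaining `stream` (for example from `create_openai_response(..., stream=True)`) is assumed.

```python
from collections.abc import AsyncIterator


async def summarize_stream(stream: AsyncIterator) -> None:
    # Dispatch on the "type" discriminator declared on each stream event model.
    async for event in stream:
        if event.type == "response.output_text.done":
            print("output text:", event.text)
        elif event.type == "response.function_call_arguments.done":
            print("tool call arguments:", event.arguments)
        elif event.type == "response.mcp_call.failed":
            print("MCP call failed")
        elif event.type == "response.completed":
            print("finished with status:", event.response.status)
```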
- :param id: ID of the upload session - :param url: Upload URL for the file or file parts - :param offset: Upload content offset - :param size: Upload content size + ASSISTANTS = "assistants" + # TODO: Add other purposes as needed + + +@json_schema_type +class OpenAIFileObject(BaseModel): + """ + OpenAI File object as defined in the OpenAI Files API. + + :param object: The object type, which is always "file" + :param id: The file identifier, which can be referenced in the API endpoints + :param bytes: The size of the file, in bytes + :param created_at: The Unix timestamp (in seconds) for when the file was created + :param expires_at: The Unix timestamp (in seconds) for when the file expires + :param filename: The name of the file + :param purpose: The intended purpose of the file + """ + + object: Literal["file"] = "file" + id: str + bytes: int + created_at: int + expires_at: int + filename: str + purpose: OpenAIFilePurpose + + +@json_schema_type +class ListOpenAIFileResponse(BaseModel): + """ + Response for listing files in OpenAI Files API. + + :param data: List of file objects + :param object: The object type, which is always "list" + """ + + data: list[OpenAIFileObject] + has_more: bool + first_id: str + last_id: str + object: Literal["list"] = "list" + + +@json_schema_type +class OpenAIFileDeleteResponse(BaseModel): + """ + Response for deleting a file in OpenAI Files API. + + :param id: The file identifier that was deleted + :param object: The object type, which is always "file" + :param deleted: Whether the file was successfully deleted """ id: str - url: str - offset: int - size: int - - -@json_schema_type -class BucketResponse(BaseModel): - name: str - - -@json_schema_type -class ListBucketResponse(BaseModel): - """ - Response representing a list of file entries. - - :param data: List of FileResponse entries - """ - - data: list[BucketResponse] - - -@json_schema_type -class FileResponse(BaseModel): - """ - Response representing a file entry. - - :param bucket: Bucket under which the file is stored (valid chars: a-zA-Z0-9_-) - :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.) - :param mime_type: MIME type of the file - :param url: Upload URL for the file contents - :param bytes: Size of the file in bytes - :param created_at: Timestamp of when the file was created - """ - - bucket: str - key: str - mime_type: str - url: str - bytes: int - created_at: int - - -@json_schema_type -class ListFileResponse(BaseModel): - """ - Response representing a list of file entries. - - :param data: List of FileResponse entries - """ - - data: list[FileResponse] + object: Literal["file"] = "file" + deleted: bool @runtime_checkable @trace_protocol class Files(Protocol): - @webmethod(route="/files", method="POST") - async def create_upload_session( + # OpenAI Files API Endpoints + @webmethod(route="/openai/v1/files", method="POST") + async def openai_upload_file( self, - bucket: str, - key: str, - mime_type: str, - size: int, - ) -> FileUploadResponse: + file: Annotated[UploadFile, File()], + purpose: Annotated[OpenAIFilePurpose, Form()], + ) -> OpenAIFileObject: """ - Create a new upload session for a file identified by a bucket and key. + Upload a file that can be used across various endpoints. - :param bucket: Bucket under which the file is stored (valid chars: a-zA-Z0-9_-). - :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). - :param mime_type: MIME type of the file. - :param size: File size in bytes. - :returns: A FileUploadResponse. 
+ The file upload should be a multipart form request with: + - file: The File object (not file name) to be uploaded. + - purpose: The intended purpose of the uploaded file. + + :param file: The uploaded file object containing content and metadata (filename, content_type, etc.). + :param purpose: The intended purpose of the uploaded file (e.g., "assistants", "fine-tune"). + :returns: An OpenAIFileObject representing the uploaded file. """ ... - @webmethod(route="/files/session:{upload_id}", method="POST", raw_bytes_request_body=True) - async def upload_content_to_session( + @webmethod(route="/openai/v1/files", method="GET") + async def openai_list_files( self, - upload_id: str, - ) -> FileResponse | None: + after: str | None = None, + limit: int | None = 10000, + order: Order | None = Order.desc, + purpose: OpenAIFilePurpose | None = None, + ) -> ListOpenAIFileResponse: """ - Upload file content to an existing upload session. - On the server, request body will have the raw bytes that are uploaded. + Returns a list of files that belong to the user's organization. - :param upload_id: ID of the upload session. - :returns: A FileResponse or None if the upload is not complete. + :param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list. + :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 10,000, and the default is 10,000. + :param order: Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order. + :param purpose: Only return files with the given purpose. + :returns: An ListOpenAIFileResponse containing the list of files. """ ... - @webmethod(route="/files/session:{upload_id}", method="GET") - async def get_upload_session_info( + @webmethod(route="/openai/v1/files/{file_id}", method="GET") + async def openai_retrieve_file( self, - upload_id: str, - ) -> FileUploadResponse: + file_id: str, + ) -> OpenAIFileObject: """ - Returns information about an existsing upload session. + Returns information about a specific file. - :param upload_id: ID of the upload session. - :returns: A FileUploadResponse. + :param file_id: The ID of the file to use for this request. + :returns: An OpenAIFileObject containing file information. """ ... - @webmethod(route="/files", method="GET") - async def list_all_buckets( + @webmethod(route="/openai/v1/files/{file_id}", method="DELETE") + async def openai_delete_file( self, - bucket: str, - ) -> ListBucketResponse: + file_id: str, + ) -> OpenAIFileDeleteResponse: """ - List all buckets. + Delete a file. - :param bucket: Bucket name (valid chars: a-zA-Z0-9_-). - :returns: A ListBucketResponse. + :param file_id: The ID of the file to use for this request. + :returns: An OpenAIFileDeleteResponse indicating successful deletion. """ ... - @webmethod(route="/files/{bucket}", method="GET") - async def list_files_in_bucket( + @webmethod(route="/openai/v1/files/{file_id}/content", method="GET") + async def openai_retrieve_file_content( self, - bucket: str, - ) -> ListFileResponse: + file_id: str, + ) -> Response: """ - List all files in a bucket. + Returns the contents of the specified file. - :param bucket: Bucket name (valid chars: a-zA-Z0-9_-). - :returns: A ListFileResponse. - """ - ... 
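Editor's note: the rewritten Files API above mirrors OpenAI's `/files` routes, so the stock OpenAI client should be able to drive it. A hedged sketch against a locally running server; the file path is a placeholder.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Upload, list, fetch and delete a file via the OpenAI-compatible Files routes above.
with open("notes.txt", "rb") as fh:  # placeholder file
    uploaded = client.files.create(file=fh, purpose="assistants")
print(uploaded.id, uploaded.filename, uploaded.bytes)

print([f.id for f in client.files.list(purpose="assistants").data])
print(client.files.content(uploaded.id).read()[:80])
client.files.delete(uploaded.id)
```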
- - @webmethod(route="/files/{bucket}/{key:path}", method="GET") - async def get_file( - self, - bucket: str, - key: str, - ) -> FileResponse: - """ - Get a file info identified by a bucket and key. - - :param bucket: Bucket name (valid chars: a-zA-Z0-9_-). - :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). - :returns: A FileResponse. - """ - ... - - @webmethod(route="/files/{bucket}/{key:path}", method="DELETE") - async def delete_file( - self, - bucket: str, - key: str, - ) -> None: - """ - Delete a file identified by a bucket and key. - - :param bucket: Bucket name (valid chars: a-zA-Z0-9_-). - :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). + :param file_id: The ID of the file to use for this request. + :returns: The raw file content as a binary response. """ ... diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index e79dc6d94..74697dd18 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -783,6 +783,48 @@ class OpenAICompletion(BaseModel): object: Literal["text_completion"] = "text_completion" +@json_schema_type +class OpenAIEmbeddingData(BaseModel): + """A single embedding data object from an OpenAI-compatible embeddings response. + + :param object: The object type, which will be "embedding" + :param embedding: The embedding vector as a list of floats (when encoding_format="float") or as a base64-encoded string (when encoding_format="base64") + :param index: The index of the embedding in the input list + """ + + object: Literal["embedding"] = "embedding" + embedding: list[float] | str + index: int + + +@json_schema_type +class OpenAIEmbeddingUsage(BaseModel): + """Usage information for an OpenAI-compatible embeddings response. + + :param prompt_tokens: The number of tokens in the input + :param total_tokens: The total number of tokens used + """ + + prompt_tokens: int + total_tokens: int + + +@json_schema_type +class OpenAIEmbeddingsResponse(BaseModel): + """Response from an OpenAI-compatible embeddings request. + + :param object: The object type, which will be "list" + :param data: List of embedding data objects + :param model: The model that was used to generate the embeddings + :param usage: Usage information + """ + + object: Literal["list"] = "list" + data: list[OpenAIEmbeddingData] + model: str + usage: OpenAIEmbeddingUsage + + class ModelStore(Protocol): async def get_model(self, identifier: str) -> Model: ... @@ -1076,6 +1118,26 @@ class InferenceProvider(Protocol): """ ... + @webmethod(route="/openai/v1/embeddings", method="POST") + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + """Generate OpenAI-compatible embeddings for the given input using the specified model. + + :param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint. + :param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings. + :param encoding_format: (Optional) The format to return the embeddings in. Can be either "float" or "base64". Defaults to "float". + :param dimensions: (Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models. 
+ :param user: (Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. + :returns: An OpenAIEmbeddingsResponse containing the embeddings. + """ + ... + class Inference(InferenceProvider): """Llama Stack Inference API for generating completions, chat completions, and embeddings. diff --git a/llama_stack/apis/vector_io/vector_io.py b/llama_stack/apis/vector_io/vector_io.py index 3ac62d42c..44cc8f904 100644 --- a/llama_stack/apis/vector_io/vector_io.py +++ b/llama_stack/apis/vector_io/vector_io.py @@ -19,8 +19,16 @@ from llama_stack.schema_utils import json_schema_type, webmethod class Chunk(BaseModel): + """ + A chunk of content that can be inserted into a vector database. + :param content: The content of the chunk, which can be interleaved text, images, or other types. + :param embedding: Optional embedding for the chunk. If not provided, it will be computed later. + :param metadata: Metadata associated with the chunk, such as document ID, source, or other relevant information. + """ + content: InterleavedContent metadata: dict[str, Any] = Field(default_factory=dict) + embedding: list[float] | None = None @json_schema_type @@ -50,7 +58,10 @@ class VectorIO(Protocol): """Insert chunks into a vector database. :param vector_db_id: The identifier of the vector database to insert the chunks into. - :param chunks: The chunks to insert. + :param chunks: The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. + `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. + If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. + If `embedding` is not provided, it will be computed later. :param ttl_seconds: The time to live of the chunks. """ ... diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index 27745edac..2f768957d 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -35,7 +35,8 @@ class StackRun(Subcommand): "config", type=str, nargs="?", # Make it optional - help="Path to config file to use for the run. Required for venv and conda environments.", + metavar="config | template", + help="Path to config file to use for the run or name of known template (`llama stack list` for a list).", ) self.parser.add_argument( "--port", @@ -59,7 +60,7 @@ class StackRun(Subcommand): "--image-type", type=str, help="Image Type used during the build. This can be either conda or container or venv.", - choices=[e.value for e in ImageType], + choices=[e.value for e in ImageType if e.value != ImageType.CONTAINER.value], ) self.parser.add_argument( "--enable-ui", @@ -154,7 +155,10 @@ class StackRun(Subcommand): # func=> if callable(getattr(args, arg)): continue - setattr(server_args, arg, getattr(args, arg)) + if arg == "config" and template_name: + server_args.config = str(config_file) + else: + setattr(server_args, arg, getattr(args, arg)) # Run the server server_main(server_args) diff --git a/llama_stack/distribution/access_control.py b/llama_stack/distribution/access_control.py deleted file mode 100644 index d560ec80f..000000000 --- a/llama_stack/distribution/access_control.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
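Editor's note: the `/openai/v1/embeddings` endpoint added to the Inference API earlier in this diff can likewise be reached with the OpenAI client. A hedged example using the `all-MiniLM-L6-v2` embedding model registered in the run configs above; it assumes a server is running locally on port 8321.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

result = client.embeddings.create(
    model="all-MiniLM-L6-v2",  # embedding model registered in the run configs above
    input=["Llama Stack now speaks the OpenAI embeddings API."],
    encoding_format="float",
)
print(len(result.data[0].embedding))  # expect 384 dimensions for all-MiniLM-L6-v2
```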
- -from typing import Any - -from llama_stack.distribution.datatypes import AccessAttributes -from llama_stack.log import get_logger - -logger = get_logger(__name__, category="core") - - -def check_access( - obj_identifier: str, - obj_attributes: AccessAttributes | None, - user_attributes: dict[str, Any] | None = None, -) -> bool: - """Check if the current user has access to the given object, based on access attributes. - - Access control algorithm: - 1. If the resource has no access_attributes, access is GRANTED to all authenticated users - 2. If the user has no attributes, access is DENIED to any object with access_attributes defined - 3. For each attribute category in the resource's access_attributes: - a. If the user lacks that category, access is DENIED - b. If the user has the category but none of the required values, access is DENIED - c. If the user has at least one matching value in each required category, access is GRANTED - - Example: - # Resource requires: - access_attributes = AccessAttributes( - roles=["admin", "data-scientist"], - teams=["ml-team"] - ) - - # User has: - user_attributes = { - "roles": ["data-scientist", "engineer"], - "teams": ["ml-team", "infra-team"], - "projects": ["llama-3"] - } - - # Result: Access GRANTED - # - User has the "data-scientist" role (matches one of the required roles) - # - AND user is part of the "ml-team" (matches the required team) - # - The extra "projects" attribute is ignored - - Args: - obj_identifier: The identifier of the resource object to check access for - obj_attributes: The access attributes of the resource object - user_attributes: The attributes of the current user - - Returns: - bool: True if access is granted, False if denied - """ - # If object has no access attributes, allow access by default - if not obj_attributes: - return True - - # If no user attributes, deny access to objects with access control - if not user_attributes: - return False - - dict_attribs = obj_attributes.model_dump(exclude_none=True) - if not dict_attribs: - return True - - # Check each attribute category (requires ALL categories to match) - # TODO: formalize this into a proper ABAC policy - for attr_key, required_values in dict_attribs.items(): - user_values = user_attributes.get(attr_key, []) - - if not user_values: - logger.debug(f"Access denied to {obj_identifier}: missing required attribute category '{attr_key}'") - return False - - if not any(val in user_values for val in required_values): - logger.debug( - f"Access denied to {obj_identifier}: " - f"no match for attribute '{attr_key}', required one of {required_values}" - ) - return False - - logger.debug(f"Access granted to {obj_identifier}") - return True diff --git a/llama_stack/distribution/access_control/__init__.py b/llama_stack/distribution/access_control/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/distribution/access_control/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/distribution/access_control/access_control.py b/llama_stack/distribution/access_control/access_control.py new file mode 100644 index 000000000..84d506d8f --- /dev/null +++ b/llama_stack/distribution/access_control/access_control.py @@ -0,0 +1,109 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any + +from llama_stack.distribution.datatypes import User + +from .conditions import ( + Condition, + ProtectedResource, + parse_conditions, +) +from .datatypes import ( + AccessRule, + Action, + Scope, +) + + +def matches_resource(resource_scope: str, actual_resource: str) -> bool: + if resource_scope == actual_resource: + return True + return resource_scope.endswith("::*") and actual_resource.startswith(resource_scope[:-1]) + + +def matches_scope( + scope: Scope, + action: Action, + resource: str, + user: str | None, +) -> bool: + if scope.resource and not matches_resource(scope.resource, resource): + return False + if scope.principal and scope.principal != user: + return False + return action in scope.actions + + +def as_list(obj: Any) -> list[Any]: + if isinstance(obj, list): + return obj + return [obj] + + +def matches_conditions( + conditions: list[Condition], + resource: ProtectedResource, + user: User, +) -> bool: + for condition in conditions: + # must match all conditions + if not condition.matches(resource, user): + return False + return True + + +def default_policy() -> list[AccessRule]: + # for backwards compatibility, if no rules are provided, assume + # full access subject to previous attribute matching rules + return [ + AccessRule( + permit=Scope(actions=list(Action)), + when=["user in owners " + name for name in ["roles", "teams", "projects", "namespaces"]], + ), + ] + + +def is_action_allowed( + policy: list[AccessRule], + action: Action, + resource: ProtectedResource, + user: User | None, +) -> bool: + # If user is not set, assume authentication is not enabled + if not user: + return True + + if not len(policy): + policy = default_policy() + + qualified_resource_id = resource.type + "::" + resource.identifier + for rule in policy: + if rule.forbid and matches_scope(rule.forbid, action, qualified_resource_id, user.principal): + if rule.when: + if matches_conditions(parse_conditions(as_list(rule.when)), resource, user): + return False + elif rule.unless: + if not matches_conditions(parse_conditions(as_list(rule.unless)), resource, user): + return False + else: + return False + elif rule.permit and matches_scope(rule.permit, action, qualified_resource_id, user.principal): + if rule.when: + if matches_conditions(parse_conditions(as_list(rule.when)), resource, user): + return True + elif rule.unless: + if not matches_conditions(parse_conditions(as_list(rule.unless)), resource, user): + return True + else: + return True + # assume access is denied unless we find a rule that permits access + return False + + +class AccessDeniedError(RuntimeError): + pass diff --git a/llama_stack/distribution/access_control/conditions.py b/llama_stack/distribution/access_control/conditions.py new file mode 100644 index 000000000..25a267124 --- /dev/null +++ b/llama_stack/distribution/access_control/conditions.py @@ -0,0 +1,129 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Protocol + + +class User(Protocol): + principal: str + attributes: dict[str, list[str]] | None + + +class ProtectedResource(Protocol): + type: str + identifier: str + owner: User + + +class Condition(Protocol): + def matches(self, resource: ProtectedResource, user: User) -> bool: ... 
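Editor's note: a hedged sketch of how the new policy engine above could be exercised. `AccessRule`, `Scope`, and `Action` come from the `datatypes` module added later in this diff, `User` from `llama_stack.distribution.datatypes`, and `DemoResource` is a stand-in that merely satisfies the `ProtectedResource` protocol.

```python
from llama_stack.distribution.access_control.access_control import is_action_allowed
from llama_stack.distribution.access_control.datatypes import AccessRule, Action, Scope
from llama_stack.distribution.datatypes import User


class DemoResource:
    """Stand-in satisfying the ProtectedResource protocol (type/identifier/owner)."""

    def __init__(self, type: str, identifier: str, owner: User | None = None):
        self.type = type
        self.identifier = identifier
        self.owner = owner


admin = User("admin@example.com", {"roles": ["admin"]})
guest = User("guest@example.com", {"roles": ["guest"]})

policy = [
    # Deleting any vector_db is forbidden unless the user has the admin role...
    AccessRule(
        forbid=Scope(actions=[Action.DELETE], resource="vector_db::*"),
        unless="user with admin in roles",
    ),
    # ...and everything else is permitted.
    AccessRule(permit=Scope(actions=list(Action))),
]

db = DemoResource("vector_db", "customer-embeddings", owner=admin)
print(is_action_allowed(policy, Action.DELETE, db, guest))  # False: forbidden, not admin
print(is_action_allowed(policy, Action.DELETE, db, admin))  # True: falls through to permit
```

Rules are evaluated in order, so placing the `forbid` rule before the blanket `permit` is what makes the exception take effect.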
+ + +class UserInOwnersList: + def __init__(self, name: str): + self.name = name + + def owners_values(self, resource: ProtectedResource) -> list[str] | None: + if ( + hasattr(resource, "owner") + and resource.owner + and resource.owner.attributes + and self.name in resource.owner.attributes + ): + return resource.owner.attributes[self.name] + else: + return None + + def matches(self, resource: ProtectedResource, user: User) -> bool: + required = self.owners_values(resource) + if not required: + return True + if not user.attributes or self.name not in user.attributes or not user.attributes[self.name]: + return False + user_values = user.attributes[self.name] + for value in required: + if value in user_values: + return True + return False + + def __repr__(self): + return f"user in owners {self.name}" + + +class UserNotInOwnersList(UserInOwnersList): + def __init__(self, name: str): + super().__init__(name) + + def matches(self, resource: ProtectedResource, user: User) -> bool: + return not super().matches(resource, user) + + def __repr__(self): + return f"user not in owners {self.name}" + + +class UserWithValueInList: + def __init__(self, name: str, value: str): + self.name = name + self.value = value + + def matches(self, resource: ProtectedResource, user: User) -> bool: + if user.attributes and self.name in user.attributes: + return self.value in user.attributes[self.name] + print(f"User does not have {self.value} in {self.name}") + return False + + def __repr__(self): + return f"user with {self.value} in {self.name}" + + +class UserWithValueNotInList(UserWithValueInList): + def __init__(self, name: str, value: str): + super().__init__(name, value) + + def matches(self, resource: ProtectedResource, user: User) -> bool: + return not super().matches(resource, user) + + def __repr__(self): + return f"user with {self.value} not in {self.name}" + + +class UserIsOwner: + def matches(self, resource: ProtectedResource, user: User) -> bool: + return resource.owner.principal == user.principal if resource.owner else False + + def __repr__(self): + return "user is owner" + + +class UserIsNotOwner: + def matches(self, resource: ProtectedResource, user: User) -> bool: + return not resource.owner or resource.owner.principal != user.principal + + def __repr__(self): + return "user is not owner" + + +def parse_condition(condition: str) -> Condition: + words = condition.split() + match words: + case ["user", "is", "owner"]: + return UserIsOwner() + case ["user", "is", "not", "owner"]: + return UserIsNotOwner() + case ["user", "with", value, "in", name]: + return UserWithValueInList(name, value) + case ["user", "with", value, "not", "in", name]: + return UserWithValueNotInList(name, value) + case ["user", "in", "owners", name]: + return UserInOwnersList(name) + case ["user", "not", "in", "owners", name]: + return UserNotInOwnersList(name) + case _: + raise ValueError(f"Invalid condition: {condition}") + + +def parse_conditions(conditions: list[str]) -> list[Condition]: + return [parse_condition(c) for c in conditions] diff --git a/llama_stack/distribution/access_control/datatypes.py b/llama_stack/distribution/access_control/datatypes.py new file mode 100644 index 000000000..3e6c624dc --- /dev/null +++ b/llama_stack/distribution/access_control/datatypes.py @@ -0,0 +1,107 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
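Editor's note: a quick illustration of the condition grammar implemented in `conditions.py` above; the printed forms match the `__repr__` methods defined on the matcher classes.

```python
from llama_stack.distribution.access_control.conditions import parse_condition

for text in [
    "user is owner",
    "user is not owner",
    "user with admin in roles",
    "user in owners teams",
    "user not in owners namespaces",
]:
    print(f"{text!r} -> {parse_condition(text)!r}")

# Anything outside the grammar raises ValueError, e.g.:
# parse_condition("user likes models")
```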
+ +from enum import Enum + +from pydantic import BaseModel, model_validator +from typing_extensions import Self + +from .conditions import parse_conditions + + +class Action(str, Enum): + CREATE = "create" + READ = "read" + UPDATE = "update" + DELETE = "delete" + + +class Scope(BaseModel): + principal: str | None = None + actions: Action | list[Action] + resource: str | None = None + + +def _mutually_exclusive(obj, a: str, b: str): + if getattr(obj, a) and getattr(obj, b): + raise ValueError(f"{a} and {b} are mutually exclusive") + + +def _require_one_of(obj, a: str, b: str): + if not getattr(obj, a) and not getattr(obj, b): + raise ValueError(f"on of {a} or {b} is required") + + +class AccessRule(BaseModel): + """Access rule based loosely on cedar policy language + + A rule defines a list of action either to permit or to forbid. It may specify a + principal or a resource that must match for the rule to take effect. The resource + to match should be specified in the form of a type qualified identifier, e.g. + model::my-model or vector_db::some-db, or a wildcard for all resources of a type, + e.g. model::*. If the principal or resource are not specified, they will match all + requests. + + A rule may also specify a condition, either a 'when' or an 'unless', with additional + constraints as to where the rule applies. The constraints supported at present are: + + - 'user with in ' + - 'user with not in ' + - 'user is owner' + - 'user is not owner' + - 'user in owners ' + - 'user not in owners ' + + Rules are tested in order to find a match. If a match is found, the request is + permitted or forbidden depending on the type of rule. If no match is found, the + request is denied. If no rules are specified, a rule that allows any action as + long as the resource attributes match the user attributes is added + (i.e. the previous behaviour is the default). 
+ + Some examples in yaml: + + - permit: + principal: user-1 + actions: [create, read, delete] + resource: model::* + description: user-1 has full access to all models + - permit: + principal: user-2 + actions: [read] + resource: model::model-1 + description: user-2 has read access to model-1 only + - permit: + actions: [read] + when: user in owner teams + description: any user has read access to any resource created by a member of their team + - forbid: + actions: [create, read, delete] + resource: vector_db::* + unless: user with admin in roles + description: only user with admin role can use vector_db resources + + """ + + permit: Scope | None = None + forbid: Scope | None = None + when: str | list[str] | None = None + unless: str | list[str] | None = None + description: str | None = None + + @model_validator(mode="after") + def validate_rule_format(self) -> Self: + _require_one_of(self, "permit", "forbid") + _mutually_exclusive(self, "permit", "forbid") + _mutually_exclusive(self, "when", "unless") + if isinstance(self.when, list): + parse_conditions(self.when) + elif self.when: + parse_conditions([self.when]) + if isinstance(self.unless, list): + parse_conditions(self.unless) + elif self.unless: + parse_conditions([self.unless]) + return self diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 072f9c425..4f9091a5d 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -29,6 +29,8 @@ SERVER_DEPENDENCIES = [ "fire", "httpx", "uvicorn", + "opentelemetry-sdk", + "opentelemetry-exporter-otlp-proto-http", ] @@ -41,23 +43,12 @@ def get_provider_dependencies( config: BuildConfig | DistributionTemplate, ) -> tuple[list[str], list[str]]: """Get normal and special dependencies from provider configuration.""" - # Extract providers based on config type if isinstance(config, DistributionTemplate): - providers = config.providers + config = config.build_config() + + providers = config.distribution_spec.providers + additional_pip_packages = config.additional_pip_packages - # TODO: This is a hack to get the dependencies for internal APIs into build - # We should have a better way to do this by formalizing the concept of "internal" APIs - # and providers, with a way to specify dependencies for them. 
- run_configs = config.run_configs - additional_pip_packages: list[str] = [] - if run_configs: - for run_config in run_configs.values(): - run_config_ = run_config.run_config(name="", providers={}, container_image=None) - if run_config_.inference_store: - additional_pip_packages.extend(run_config_.inference_store.pip_packages) - elif isinstance(config, BuildConfig): - providers = config.distribution_spec.providers - additional_pip_packages = config.additional_pip_packages deps = [] registry = get_provider_registry(config) for api_str, provider_or_providers in providers.items(): @@ -85,8 +76,7 @@ def get_provider_dependencies( else: normal_deps.append(package) - if additional_pip_packages: - normal_deps.extend(additional_pip_packages) + normal_deps.extend(additional_pip_packages or []) return list(set(normal_deps)), list(set(special_deps)) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index def7048c0..abc3f0065 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -24,6 +24,7 @@ from llama_stack.apis.shields import Shield, ShieldInput from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput from llama_stack.apis.vector_io import VectorIO +from llama_stack.distribution.access_control.datatypes import AccessRule from llama_stack.providers.datatypes import Api, ProviderSpec from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig from llama_stack.providers.utils.sqlstore.sqlstore import SqlStoreConfig @@ -35,126 +36,66 @@ LLAMA_STACK_RUN_CONFIG_VERSION = "2" RoutingKey = str | list[str] -class AccessAttributes(BaseModel): - """Structured representation of user attributes for access control. +class User(BaseModel): + principal: str + # further attributes that may be used for access control decisions + attributes: dict[str, list[str]] | None = None - This model defines a structured approach to representing user attributes - with common standard categories for access control. - - Standard attribute categories include: - - roles: Role-based attributes (e.g., admin, data-scientist) - - teams: Team-based attributes (e.g., ml-team, infra-team) - - projects: Project access attributes (e.g., llama-3, customer-insights) - - namespaces: Namespace-based access control for resource isolation - """ - - # Standard attribute categories - the minimal set we need now - roles: list[str] | None = Field( - default=None, description="Role-based attributes (e.g., 'admin', 'data-scientist', 'user')" - ) - - teams: list[str] | None = Field(default=None, description="Team-based attributes (e.g., 'ml-team', 'nlp-team')") - - projects: list[str] | None = Field( - default=None, description="Project-based access attributes (e.g., 'llama-3', 'customer-insights')" - ) - - namespaces: list[str] | None = Field( - default=None, description="Namespace-based access control for resource isolation" - ) + def __init__(self, principal: str, attributes: dict[str, list[str]] | None): + super().__init__(principal=principal, attributes=attributes) -class ResourceWithACL(Resource): - """Extension of Resource that adds attribute-based access control capabilities. +class ResourceWithOwner(Resource): + """Extension of Resource that adds an optional owner, i.e. the user that created the + resource. 
This can be used to constrain access to the resource.""" - This class adds an optional access_attributes field that allows fine-grained control - over which users can access each resource. When attributes are defined, a user must have - matching attributes to access the resource. - - Attribute Matching Algorithm: - 1. If a resource has no access_attributes (None or empty dict), it's visible to all authenticated users - 2. Each key in access_attributes represents an attribute category (e.g., "roles", "teams", "projects") - 3. The matching algorithm requires ALL categories to match (AND relationship between categories) - 4. Within each category, ANY value match is sufficient (OR relationship within a category) - - Examples: - # Resource visible to everyone (no access control) - model = Model(identifier="llama-2", ...) - - # Resource visible only to admins - model = Model( - identifier="gpt-4", - access_attributes=AccessAttributes(roles=["admin"]) - ) - - # Resource visible to data scientists on the ML team - model = Model( - identifier="private-model", - access_attributes=AccessAttributes( - roles=["data-scientist", "researcher"], - teams=["ml-team"] - ) - ) - # ^ User must have at least one of the roles AND be on the ml-team - - # Resource visible to users with specific project access - vector_db = VectorDB( - identifier="customer-embeddings", - access_attributes=AccessAttributes( - projects=["customer-insights"], - namespaces=["confidential"] - ) - ) - # ^ User must have access to the customer-insights project AND have confidential namespace - """ - - access_attributes: AccessAttributes | None = None + owner: User | None = None # Use the extended Resource for all routable objects -class ModelWithACL(Model, ResourceWithACL): +class ModelWithOwner(Model, ResourceWithOwner): pass -class ShieldWithACL(Shield, ResourceWithACL): +class ShieldWithOwner(Shield, ResourceWithOwner): pass -class VectorDBWithACL(VectorDB, ResourceWithACL): +class VectorDBWithOwner(VectorDB, ResourceWithOwner): pass -class DatasetWithACL(Dataset, ResourceWithACL): +class DatasetWithOwner(Dataset, ResourceWithOwner): pass -class ScoringFnWithACL(ScoringFn, ResourceWithACL): +class ScoringFnWithOwner(ScoringFn, ResourceWithOwner): pass -class BenchmarkWithACL(Benchmark, ResourceWithACL): +class BenchmarkWithOwner(Benchmark, ResourceWithOwner): pass -class ToolWithACL(Tool, ResourceWithACL): +class ToolWithOwner(Tool, ResourceWithOwner): pass -class ToolGroupWithACL(ToolGroup, ResourceWithACL): +class ToolGroupWithOwner(ToolGroup, ResourceWithOwner): pass RoutableObject = Model | Shield | VectorDB | Dataset | ScoringFn | Benchmark | Tool | ToolGroup RoutableObjectWithProvider = Annotated[ - ModelWithACL - | ShieldWithACL - | VectorDBWithACL - | DatasetWithACL - | ScoringFnWithACL - | BenchmarkWithACL - | ToolWithACL - | ToolGroupWithACL, + ModelWithOwner + | ShieldWithOwner + | VectorDBWithOwner + | DatasetWithOwner + | ScoringFnWithOwner + | BenchmarkWithOwner + | ToolWithOwner + | ToolGroupWithOwner, Field(discriminator="type"), ] @@ -234,6 +175,7 @@ class AuthenticationConfig(BaseModel): ..., description="Provider-specific configuration", ) + access_policy: list[AccessRule] = Field(default=[], description="Rules for determining access to resources") class AuthenticationRequiredError(Exception): diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index f32130cf9..cebfabba5 100644 --- a/llama_stack/distribution/library_client.py +++ 
b/llama_stack/distribution/library_client.py @@ -149,12 +149,13 @@ class LlamaStackAsLibraryClient(LlamaStackClient): logger.info(f"Removed handler {handler.__class__.__name__} from root logger") def request(self, *args, **kwargs): + # NOTE: We are using AsyncLlamaStackClient under the hood + # A new event loop is needed to convert the AsyncStream + # from async client into SyncStream return type for streaming + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + if kwargs.get("stream"): - # NOTE: We are using AsyncLlamaStackClient under the hood - # A new event loop is needed to convert the AsyncStream - # from async client into SyncStream return type for streaming - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) def sync_generator(): try: @@ -172,7 +173,14 @@ class LlamaStackAsLibraryClient(LlamaStackClient): return sync_generator() else: - return asyncio.run(self.async_client.request(*args, **kwargs)) + try: + result = loop.run_until_complete(self.async_client.request(*args, **kwargs)) + finally: + pending = asyncio.all_tasks(loop) + if pending: + loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True)) + loop.close() + return result class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): diff --git a/llama_stack/distribution/request_headers.py b/llama_stack/distribution/request_headers.py index b03d2dee8..81d494e04 100644 --- a/llama_stack/distribution/request_headers.py +++ b/llama_stack/distribution/request_headers.py @@ -10,6 +10,8 @@ import logging from contextlib import AbstractContextManager from typing import Any +from llama_stack.distribution.datatypes import User + from .utils.dynamic import instantiate_class_type log = logging.getLogger(__name__) @@ -21,12 +23,10 @@ PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None) class RequestProviderDataContext(AbstractContextManager): """Context manager for request provider data""" - def __init__( - self, provider_data: dict[str, Any] | None = None, auth_attributes: dict[str, list[str]] | None = None - ): + def __init__(self, provider_data: dict[str, Any] | None = None, user: User | None = None): self.provider_data = provider_data or {} - if auth_attributes: - self.provider_data["__auth_attributes"] = auth_attributes + if user: + self.provider_data["__authenticated_user"] = user self.token = None @@ -95,9 +95,9 @@ def request_provider_data_context( return RequestProviderDataContext(provider_data, auth_attributes) -def get_auth_attributes() -> dict[str, list[str]] | None: +def get_authenticated_user() -> User | None: """Helper to retrieve auth attributes from the provider data context""" provider_data = PROVIDER_DATA_VAR.get() if not provider_data: return None - return provider_data.get("__auth_attributes") + return provider_data.get("__authenticated_user") diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index b7c7cb87f..6e7bb5edd 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -28,6 +28,7 @@ from llama_stack.apis.vector_dbs import VectorDBs from llama_stack.apis.vector_io import VectorIO from llama_stack.distribution.client import get_client_impl from llama_stack.distribution.datatypes import ( + AccessRule, AutoRoutedProviderSpec, Provider, RoutingTableProviderSpec, @@ -118,6 +119,7 @@ async def resolve_impls( run_config: StackRunConfig, provider_registry: ProviderRegistry, dist_registry: DistributionRegistry, + policy: list[AccessRule], ) -> dict[Api, Any]: """ Resolves 
provider implementations by: @@ -140,7 +142,7 @@ async def resolve_impls( sorted_providers = sort_providers_by_deps(providers_with_specs, run_config) - return await instantiate_providers(sorted_providers, router_apis, dist_registry, run_config) + return await instantiate_providers(sorted_providers, router_apis, dist_registry, run_config, policy) def specs_for_autorouted_apis(apis_to_serve: list[str] | set[str]) -> dict[str, dict[str, ProviderWithSpec]]: @@ -247,6 +249,7 @@ async def instantiate_providers( router_apis: set[Api], dist_registry: DistributionRegistry, run_config: StackRunConfig, + policy: list[AccessRule], ) -> dict: """Instantiates providers asynchronously while managing dependencies.""" impls: dict[Api, Any] = {} @@ -261,7 +264,7 @@ async def instantiate_providers( if isinstance(provider.spec, RoutingTableProviderSpec): inner_impls = inner_impls_by_provider_id[f"inner-{provider.spec.router_api.value}"] - impl = await instantiate_provider(provider, deps, inner_impls, dist_registry, run_config) + impl = await instantiate_provider(provider, deps, inner_impls, dist_registry, run_config, policy) if api_str.startswith("inner-"): inner_impls_by_provider_id[api_str][provider.provider_id] = impl @@ -312,6 +315,7 @@ async def instantiate_provider( inner_impls: dict[str, Any], dist_registry: DistributionRegistry, run_config: StackRunConfig, + policy: list[AccessRule], ): provider_spec = provider.spec if not hasattr(provider_spec, "module"): @@ -336,13 +340,15 @@ async def instantiate_provider( method = "get_routing_table_impl" config = None - args = [provider_spec.api, inner_impls, deps, dist_registry] + args = [provider_spec.api, inner_impls, deps, dist_registry, policy] else: method = "get_provider_impl" config_type = instantiate_class_type(provider_spec.config_class) config = config_type(**provider.config) args = [config, deps] + if "policy" in inspect.signature(getattr(module, method)).parameters: + args.append(policy) fn = getattr(module, method) impl = await fn(*args) diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index 1358d5812..0a0c13880 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -6,7 +6,7 @@ from typing import Any -from llama_stack.distribution.datatypes import RoutedProtocol +from llama_stack.distribution.datatypes import AccessRule, RoutedProtocol from llama_stack.distribution.stack import StackRunConfig from llama_stack.distribution.store import DistributionRegistry from llama_stack.providers.datatypes import Api, RoutingTable @@ -18,6 +18,7 @@ async def get_routing_table_impl( impls_by_provider_id: dict[str, RoutedProtocol], _deps, dist_registry: DistributionRegistry, + policy: list[AccessRule], ) -> Any: from ..routing_tables.benchmarks import BenchmarksRoutingTable from ..routing_tables.datasets import DatasetsRoutingTable @@ -40,7 +41,7 @@ async def get_routing_table_impl( if api.value not in api_to_tables: raise ValueError(f"API {api.value} not found in router map") - impl = api_to_tables[api.value](impls_by_provider_id, dist_registry) + impl = api_to_tables[api.value](impls_by_provider_id, dist_registry, policy) await impl.initialize() return impl diff --git a/llama_stack/distribution/routers/inference.py b/llama_stack/distribution/routers/inference.py index f77b19302..763bd9105 100644 --- a/llama_stack/distribution/routers/inference.py +++ b/llama_stack/distribution/routers/inference.py @@ -45,6 +45,7 @@ from 
llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, + OpenAIEmbeddingsResponse, OpenAIMessageParam, OpenAIResponseFormatParam, ) @@ -546,6 +547,34 @@ class InferenceRouter(Inference): await self.store.store_chat_completion(response, messages) return response + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + logger.debug( + f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}", + ) + model_obj = await self.routing_table.get_model(model) + if model_obj is None: + raise ValueError(f"Model '{model}' not found") + if model_obj.model_type != ModelType.embedding: + raise ValueError(f"Model '{model}' is not an embedding model") + + params = dict( + model=model_obj.identifier, + input=input, + encoding_format=encoding_format, + dimensions=dimensions, + user=user, + ) + + provider = self.routing_table.get_provider_impl(model_obj.identifier) + return await provider.openai_embeddings(**params) + async def list_chat_completions( self, after: str | None = None, diff --git a/llama_stack/distribution/routing_tables/benchmarks.py b/llama_stack/distribution/routing_tables/benchmarks.py index 589a00c02..815483494 100644 --- a/llama_stack/distribution/routing_tables/benchmarks.py +++ b/llama_stack/distribution/routing_tables/benchmarks.py @@ -8,7 +8,7 @@ from typing import Any from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse from llama_stack.distribution.datatypes import ( - BenchmarkWithACL, + BenchmarkWithOwner, ) from llama_stack.log import get_logger @@ -47,7 +47,7 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): ) if provider_benchmark_id is None: provider_benchmark_id = benchmark_id - benchmark = BenchmarkWithACL( + benchmark = BenchmarkWithOwner( identifier=benchmark_id, dataset_id=dataset_id, scoring_functions=scoring_functions, diff --git a/llama_stack/distribution/routing_tables/common.py b/llama_stack/distribution/routing_tables/common.py index 8ec87ca50..b79c8a2a8 100644 --- a/llama_stack/distribution/routing_tables/common.py +++ b/llama_stack/distribution/routing_tables/common.py @@ -8,14 +8,14 @@ from typing import Any from llama_stack.apis.resource import ResourceType from llama_stack.apis.scoring_functions import ScoringFn -from llama_stack.distribution.access_control import check_access +from llama_stack.distribution.access_control.access_control import AccessDeniedError, is_action_allowed from llama_stack.distribution.datatypes import ( - AccessAttributes, + AccessRule, RoutableObject, RoutableObjectWithProvider, RoutedProtocol, ) -from llama_stack.distribution.request_headers import get_auth_attributes +from llama_stack.distribution.request_headers import get_authenticated_user from llama_stack.distribution.store import DistributionRegistry from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api, RoutingTable @@ -73,9 +73,11 @@ class CommonRoutingTableImpl(RoutingTable): self, impls_by_provider_id: dict[str, RoutedProtocol], dist_registry: DistributionRegistry, + policy: list[AccessRule], ) -> None: self.impls_by_provider_id = impls_by_provider_id self.dist_registry = dist_registry + self.policy = policy async def initialize(self) -> None: async def add_objects(objs: list[RoutableObjectWithProvider], provider_id: str, cls) -> 
None: @@ -166,13 +168,15 @@ class CommonRoutingTableImpl(RoutingTable): return None # Check if user has permission to access this object - if not check_access(obj.identifier, getattr(obj, "access_attributes", None), get_auth_attributes()): - logger.debug(f"Access denied to {type} '{identifier}' based on attribute mismatch") + if not is_action_allowed(self.policy, "read", obj, get_authenticated_user()): + logger.debug(f"Access denied to {type} '{identifier}'") return None return obj async def unregister_object(self, obj: RoutableObjectWithProvider) -> None: + if not is_action_allowed(self.policy, "delete", obj, get_authenticated_user()): + raise AccessDeniedError() await self.dist_registry.delete(obj.type, obj.identifier) await unregister_object_from_provider(obj, self.impls_by_provider_id[obj.provider_id]) @@ -187,11 +191,12 @@ class CommonRoutingTableImpl(RoutingTable): p = self.impls_by_provider_id[obj.provider_id] # If object supports access control but no attributes set, use creator's attributes - if not obj.access_attributes: - creator_attributes = get_auth_attributes() - if creator_attributes: - obj.access_attributes = AccessAttributes(**creator_attributes) - logger.info(f"Setting access attributes for {obj.type} '{obj.identifier}' based on creator's identity") + creator = get_authenticated_user() + if not is_action_allowed(self.policy, "create", obj, creator): + raise AccessDeniedError() + if creator: + obj.owner = creator + logger.info(f"Setting owner for {obj.type} '{obj.identifier}' to {obj.owner.principal}") registered_obj = await register_object_with_provider(obj, p) # TODO: This needs to be fixed for all APIs once they return the registered object @@ -210,9 +215,7 @@ class CommonRoutingTableImpl(RoutingTable): # Apply attribute-based access control filtering if filtered_objs: filtered_objs = [ - obj - for obj in filtered_objs - if check_access(obj.identifier, getattr(obj, "access_attributes", None), get_auth_attributes()) + obj for obj in filtered_objs if is_action_allowed(self.policy, "read", obj, get_authenticated_user()) ] return filtered_objs diff --git a/llama_stack/distribution/routing_tables/datasets.py b/llama_stack/distribution/routing_tables/datasets.py index 4401ad47e..fb34f40b6 100644 --- a/llama_stack/distribution/routing_tables/datasets.py +++ b/llama_stack/distribution/routing_tables/datasets.py @@ -19,7 +19,7 @@ from llama_stack.apis.datasets import ( ) from llama_stack.apis.resource import ResourceType from llama_stack.distribution.datatypes import ( - DatasetWithACL, + DatasetWithOwner, ) from llama_stack.log import get_logger @@ -74,7 +74,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): if metadata is None: metadata = {} - dataset = DatasetWithACL( + dataset = DatasetWithOwner( identifier=dataset_id, provider_resource_id=provider_dataset_id, provider_id=provider_id, diff --git a/llama_stack/distribution/routing_tables/models.py b/llama_stack/distribution/routing_tables/models.py index 7216d9935..c6a10ea9b 100644 --- a/llama_stack/distribution/routing_tables/models.py +++ b/llama_stack/distribution/routing_tables/models.py @@ -9,7 +9,7 @@ from typing import Any from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel from llama_stack.distribution.datatypes import ( - ModelWithACL, + ModelWithOwner, ) from llama_stack.log import get_logger @@ -65,7 +65,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models): model_type = ModelType.llm if "embedding_dimension" not in metadata 
and model_type == ModelType.embedding: raise ValueError("Embedding model must have an embedding dimension in its metadata") - model = ModelWithACL( + model = ModelWithOwner( identifier=model_id, provider_resource_id=provider_model_id, provider_id=provider_id, diff --git a/llama_stack/distribution/routing_tables/scoring_functions.py b/llama_stack/distribution/routing_tables/scoring_functions.py index d85f64b57..742cc3ca6 100644 --- a/llama_stack/distribution/routing_tables/scoring_functions.py +++ b/llama_stack/distribution/routing_tables/scoring_functions.py @@ -13,7 +13,7 @@ from llama_stack.apis.scoring_functions import ( ScoringFunctions, ) from llama_stack.distribution.datatypes import ( - ScoringFnWithACL, + ScoringFnWithOwner, ) from llama_stack.log import get_logger @@ -50,7 +50,7 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions): raise ValueError( "No provider specified and multiple providers available. Please specify a provider_id." ) - scoring_fn = ScoringFnWithACL( + scoring_fn = ScoringFnWithOwner( identifier=scoring_fn_id, description=description, return_type=return_type, diff --git a/llama_stack/distribution/routing_tables/shields.py b/llama_stack/distribution/routing_tables/shields.py index 7f62596c9..5215981b9 100644 --- a/llama_stack/distribution/routing_tables/shields.py +++ b/llama_stack/distribution/routing_tables/shields.py @@ -9,7 +9,7 @@ from typing import Any from llama_stack.apis.resource import ResourceType from llama_stack.apis.shields import ListShieldsResponse, Shield, Shields from llama_stack.distribution.datatypes import ( - ShieldWithACL, + ShieldWithOwner, ) from llama_stack.log import get_logger @@ -47,7 +47,7 @@ class ShieldsRoutingTable(CommonRoutingTableImpl, Shields): ) if params is None: params = {} - shield = ShieldWithACL( + shield = ShieldWithOwner( identifier=shield_id, provider_resource_id=provider_shield_id, provider_id=provider_id, diff --git a/llama_stack/distribution/routing_tables/toolgroups.py b/llama_stack/distribution/routing_tables/toolgroups.py index 2f7dc3e06..b86f057bd 100644 --- a/llama_stack/distribution/routing_tables/toolgroups.py +++ b/llama_stack/distribution/routing_tables/toolgroups.py @@ -8,7 +8,7 @@ from typing import Any from llama_stack.apis.common.content_types import URL from llama_stack.apis.tools import ListToolGroupsResponse, ListToolsResponse, Tool, ToolGroup, ToolGroups -from llama_stack.distribution.datatypes import ToolGroupWithACL +from llama_stack.distribution.datatypes import ToolGroupWithOwner from llama_stack.log import get_logger from .common import CommonRoutingTableImpl @@ -106,7 +106,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): mcp_endpoint: URL | None = None, args: dict[str, Any] | None = None, ) -> None: - toolgroup = ToolGroupWithACL( + toolgroup = ToolGroupWithOwner( identifier=toolgroup_id, provider_id=provider_id, provider_resource_id=toolgroup_id, diff --git a/llama_stack/distribution/routing_tables/vector_dbs.py b/llama_stack/distribution/routing_tables/vector_dbs.py index dc6c0d0ef..542e965f8 100644 --- a/llama_stack/distribution/routing_tables/vector_dbs.py +++ b/llama_stack/distribution/routing_tables/vector_dbs.py @@ -10,7 +10,7 @@ from llama_stack.apis.models import ModelType from llama_stack.apis.resource import ResourceType from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs from llama_stack.distribution.datatypes import ( - VectorDBWithACL, + VectorDBWithOwner, ) from llama_stack.log import get_logger @@ 
-63,7 +63,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs): "embedding_model": embedding_model, "embedding_dimension": model.metadata["embedding_dimension"], } - vector_db = TypeAdapter(VectorDBWithACL).validate_python(vector_db_data) + vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data) await self.register_object(vector_db) return vector_db diff --git a/llama_stack/distribution/server/auth.py b/llama_stack/distribution/server/auth.py index fb26b49a7..81b1ffd37 100644 --- a/llama_stack/distribution/server/auth.py +++ b/llama_stack/distribution/server/auth.py @@ -105,24 +105,16 @@ class AuthenticationMiddleware: logger.exception("Error during authentication") return await self._send_auth_error(send, "Authentication service error") - # Store attributes in request scope for access control - if validation_result.access_attributes: - user_attributes = validation_result.access_attributes.model_dump(exclude_none=True) - else: - logger.warning("No access attributes, setting namespace to token by default") - user_attributes = { - "roles": [token], - } - # Store the client ID in the request scope so that downstream middleware (like QuotaMiddleware) # can identify the requester and enforce per-client rate limits. scope["authenticated_client_id"] = token # Store attributes in request scope - scope["user_attributes"] = user_attributes scope["principal"] = validation_result.principal + if validation_result.attributes: + scope["user_attributes"] = validation_result.attributes logger.debug( - f"Authentication successful: {validation_result.principal} with {len(scope['user_attributes'])} attributes" + f"Authentication successful: {validation_result.principal} with {len(validation_result.attributes)} attributes" ) return await self.app(scope, receive, send) diff --git a/llama_stack/distribution/server/auth_providers.py b/llama_stack/distribution/server/auth_providers.py index 723a65b77..942ff8a18 100644 --- a/llama_stack/distribution/server/auth_providers.py +++ b/llama_stack/distribution/server/auth_providers.py @@ -16,43 +16,18 @@ from jose import jwt from pydantic import BaseModel, Field, field_validator, model_validator from typing_extensions import Self -from llama_stack.distribution.datatypes import AccessAttributes, AuthenticationConfig, AuthProviderType +from llama_stack.distribution.datatypes import AuthenticationConfig, AuthProviderType, User from llama_stack.log import get_logger logger = get_logger(name=__name__, category="auth") -class TokenValidationResult(BaseModel): - principal: str | None = Field( - default=None, - description="The principal (username or persistent identifier) of the authenticated user", - ) - access_attributes: AccessAttributes | None = Field( - default=None, - description=""" - Structured user attributes for attribute-based access control. - - These attributes determine which resources the user can access. - The model provides standard categories like "roles", "teams", "projects", and "namespaces". - Each attribute category contains a list of values that the user has for that category. - During access control checks, these values are compared against resource requirements. 
- - Example with standard categories: - ```json - { - "roles": ["admin", "data-scientist"], - "teams": ["ml-team"], - "projects": ["llama-3"], - "namespaces": ["research"] - } - ``` - """, - ) - - -class AuthResponse(TokenValidationResult): +class AuthResponse(BaseModel): """The format of the authentication response from the auth endpoint.""" + principal: str + # further attributes that may be used for access control decisions + attributes: dict[str, list[str]] | None = None message: str | None = Field( default=None, description="Optional message providing additional context about the authentication result." ) @@ -78,7 +53,7 @@ class AuthProvider(ABC): """Abstract base class for authentication providers.""" @abstractmethod - async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: + async def validate_token(self, token: str, scope: dict | None = None) -> User: """Validate a token and return access attributes.""" pass @@ -88,10 +63,10 @@ class AuthProvider(ABC): pass -def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> AccessAttributes: - attributes = AccessAttributes() +def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> dict[str, list[str]]: + attributes: dict[str, list[str]] = {} for claim_key, attribute_key in mapping.items(): - if claim_key not in claims or not hasattr(attributes, attribute_key): + if claim_key not in claims: continue claim = claims[claim_key] if isinstance(claim, list): @@ -99,11 +74,10 @@ def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) else: values = claim.split() - current = getattr(attributes, attribute_key) - if current: - current.extend(values) + if attribute_key in attributes: + attributes[attribute_key].extend(values) else: - setattr(attributes, attribute_key, values) + attributes[attribute_key] = values return attributes @@ -145,8 +119,6 @@ class OAuth2TokenAuthProviderConfig(BaseModel): for key, value in v.items(): if not value: raise ValueError(f"claims_mapping value cannot be empty: {key}") - if value not in AccessAttributes.model_fields: - raise ValueError(f"claims_mapping value is not a valid attribute: {value}") return v @model_validator(mode="after") @@ -171,14 +143,14 @@ class OAuth2TokenAuthProvider(AuthProvider): self._jwks: dict[str, str] = {} self._jwks_lock = Lock() - async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: + async def validate_token(self, token: str, scope: dict | None = None) -> User: if self.config.jwks: return await self.validate_jwt_token(token, scope) if self.config.introspection: return await self.introspect_token(token, scope) raise ValueError("One of jwks or introspection must be configured") - async def validate_jwt_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: + async def validate_jwt_token(self, token: str, scope: dict | None = None) -> User: """Validate a token using the JWT token.""" await self._refresh_jwks() @@ -203,12 +175,12 @@ class OAuth2TokenAuthProvider(AuthProvider): # We should incorporate these into the access attributes. 
principal = claims["sub"] access_attributes = get_attributes_from_claims(claims, self.config.claims_mapping) - return TokenValidationResult( + return User( principal=principal, - access_attributes=access_attributes, + attributes=access_attributes, ) - async def introspect_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: + async def introspect_token(self, token: str, scope: dict | None = None) -> User: """Validate a token using token introspection as defined by RFC 7662.""" form = { "token": token, @@ -242,9 +214,9 @@ class OAuth2TokenAuthProvider(AuthProvider): raise ValueError("Token not active") principal = fields["sub"] or fields["username"] access_attributes = get_attributes_from_claims(fields, self.config.claims_mapping) - return TokenValidationResult( + return User( principal=principal, - access_attributes=access_attributes, + attributes=access_attributes, ) except httpx.TimeoutException: logger.exception("Token introspection request timed out") @@ -299,7 +271,7 @@ class CustomAuthProvider(AuthProvider): self.config = config self._client = None - async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult: + async def validate_token(self, token: str, scope: dict | None = None) -> User: """Validate a token using the custom authentication endpoint.""" if scope is None: scope = {} @@ -341,7 +313,7 @@ class CustomAuthProvider(AuthProvider): try: response_data = response.json() auth_response = AuthResponse(**response_data) - return auth_response + return User(auth_response.principal, auth_response.attributes) except Exception as e: logger.exception("Error parsing authentication response") raise ValueError("Invalid authentication response format") from e diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 6c88bbfe9..4f2427a55 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -18,7 +18,7 @@ from collections.abc import Callable from contextlib import asynccontextmanager from importlib.metadata import version as parse_version from pathlib import Path -from typing import Annotated, Any +from typing import Annotated, Any, get_origin import rich.pretty import yaml @@ -26,17 +26,13 @@ from aiohttp import hdrs from fastapi import Body, FastAPI, HTTPException, Request from fastapi import Path as FastapiPath from fastapi.exceptions import RequestValidationError -from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse from openai import BadRequestError from pydantic import BaseModel, ValidationError from llama_stack.distribution.datatypes import AuthenticationRequiredError, LoggingConfig, StackRunConfig from llama_stack.distribution.distribution import builtin_automatically_routed_apis -from llama_stack.distribution.request_headers import ( - PROVIDER_DATA_VAR, - request_provider_data_context, -) +from llama_stack.distribution.request_headers import PROVIDER_DATA_VAR, User, request_provider_data_context from llama_stack.distribution.resolver import InvalidProviderError from llama_stack.distribution.server.routes import ( find_matching_route, @@ -217,11 +213,13 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable: async def route_handler(request: Request, **kwargs): # Get auth attributes from the request scope user_attributes = request.scope.get("user_attributes", {}) + principal = request.scope.get("principal", "") + user = User(principal, 
user_attributes) await log_request_pre_validation(request) # Use context manager with both provider data and auth attributes - with request_provider_data_context(request.headers, user_attributes): + with request_provider_data_context(request.headers, user): is_streaming = is_streaming_request(func.__name__, request, **kwargs) try: @@ -244,15 +242,23 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable: path_params = extract_path_params(route) if method == "post": - # Annotate parameters that are in the path with Path(...) and others with Body(...) - new_params = [new_params[0]] + [ - ( - param.replace(annotation=Annotated[param.annotation, FastapiPath(..., title=param.name)]) - if param.name in path_params - else param.replace(annotation=Annotated[param.annotation, Body(..., embed=True)]) - ) - for param in new_params[1:] - ] + # Annotate parameters that are in the path with Path(...) and others with Body(...), + # but preserve existing File() and Form() annotations for multipart form data + new_params = ( + [new_params[0]] + + [ + ( + param.replace(annotation=Annotated[param.annotation, FastapiPath(..., title=param.name)]) + if param.name in path_params + else ( + param # Keep original annotation if it's already an Annotated type + if get_origin(param.annotation) is Annotated + else param.replace(annotation=Annotated[param.annotation, Body(..., embed=True)]) + ) + ) + for param in new_params[1:] + ] + ) route_handler.__signature__ = sig.replace(parameters=new_params) @@ -472,17 +478,6 @@ def main(args: argparse.Namespace | None = None): window_seconds=window_seconds, ) - # --- CORS middleware for local development --- - # TODO: move to reverse proxy - ui_port = os.environ.get("LLAMA_STACK_UI_PORT", 8322) - app.add_middleware( - CORSMiddleware, - allow_origins=[f"http://localhost:{ui_port}"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - try: impls = asyncio.run(construct_stack(config)) except InvalidProviderError as e: diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index fc68dc016..5a9708497 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -223,7 +223,10 @@ async def construct_stack( run_config: StackRunConfig, provider_registry: ProviderRegistry | None = None ) -> dict[Api, Any]: dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name) - impls = await resolve_impls(run_config, provider_registry or get_provider_registry(run_config), dist_registry) + policy = run_config.server.auth.access_policy if run_config.server.auth else [] + impls = await resolve_impls( + run_config, provider_registry or get_provider_registry(run_config), dist_registry, policy + ) # Add internal implementations after all other providers are resolved add_internal_implementations(impls, run_config) diff --git a/llama_stack/distribution/start_stack.sh b/llama_stack/distribution/start_stack.sh index 996935a5e..85bfceec4 100755 --- a/llama_stack/distribution/start_stack.sh +++ b/llama_stack/distribution/start_stack.sh @@ -7,10 +7,6 @@ # the root directory of this source tree. 
-CONTAINER_BINARY=${CONTAINER_BINARY:-docker} -CONTAINER_OPTS=${CONTAINER_OPTS:-} -LLAMA_CHECKPOINT_DIR=${LLAMA_CHECKPOINT_DIR:-} -LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} PYPI_VERSION=${PYPI_VERSION:-} VIRTUAL_ENV=${VIRTUAL_ENV:-} @@ -132,63 +128,7 @@ if [[ "$env_type" == "venv" || "$env_type" == "conda" ]]; then $env_vars \ $other_args elif [[ "$env_type" == "container" ]]; then - set -x - - # Check if container command is available - if ! is_command_available $CONTAINER_BINARY; then - printf "${RED}Error: ${CONTAINER_BINARY} command not found. Is ${CONTAINER_BINARY} installed and in your PATH?${NC}" >&2 - exit 1 - fi - - if is_command_available selinuxenabled &> /dev/null && selinuxenabled; then - # Disable SELinux labels - CONTAINER_OPTS="$CONTAINER_OPTS --security-opt label=disable" - fi - - mounts="" - if [ -n "$LLAMA_STACK_DIR" ]; then - mounts="$mounts -v $(readlink -f $LLAMA_STACK_DIR):/app/llama-stack-source" - fi - if [ -n "$LLAMA_CHECKPOINT_DIR" ]; then - mounts="$mounts -v $LLAMA_CHECKPOINT_DIR:/root/.llama" - CONTAINER_OPTS="$CONTAINER_OPTS --gpus=all" - fi - - if [ -n "$PYPI_VERSION" ]; then - version_tag="$PYPI_VERSION" - elif [ -n "$LLAMA_STACK_DIR" ]; then - version_tag="dev" - elif [ -n "$TEST_PYPI_VERSION" ]; then - version_tag="test-$TEST_PYPI_VERSION" - else - if ! is_command_available jq; then - echo -e "${RED}Error: jq not found" >&2 - exit 1 - fi - URL="https://pypi.org/pypi/llama-stack/json" - version_tag=$(curl -s $URL | jq -r '.info.version') - fi - - # Build the command with optional yaml config - cmd="$CONTAINER_BINARY run $CONTAINER_OPTS -it \ - -p $port:$port \ - $env_vars \ - $mounts \ - --env LLAMA_STACK_PORT=$port \ - --entrypoint python \ - $container_image:$version_tag \ - -m llama_stack.distribution.server.server" - - # Add yaml config if provided, otherwise use default - if [ -n "$yaml_config" ]; then - cmd="$cmd -v $yaml_config:/app/run.yaml --config /app/run.yaml" - else - cmd="$cmd --config /app/run.yaml" - fi - - # Add any other args - cmd="$cmd $other_args" - - # Execute the command - eval $cmd + echo -e "${RED}Warning: Llama Stack no longer supports running Containers via the 'llama stack run' command.${NC}" + echo -e "Please refer to the documentation for more information: https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html#llama-stack-build" + exit 1 fi diff --git a/llama_stack/distribution/utils/exec.py b/llama_stack/distribution/utils/exec.py index 7c2e00524..2db01689f 100644 --- a/llama_stack/distribution/utils/exec.py +++ b/llama_stack/distribution/utils/exec.py @@ -23,11 +23,8 @@ from llama_stack.distribution.utils.image_types import LlamaStackImageType def formulate_run_args(image_type, image_name, config, template_name) -> list: env_name = "" - if image_type == LlamaStackImageType.CONTAINER.value: - env_name = ( - f"distribution-{template_name}" if template_name else (config.container_image if config else image_name) - ) - elif image_type == LlamaStackImageType.CONDA.value: + + if image_type == LlamaStackImageType.CONDA.value: current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") env_name = image_name or current_conda_env if not env_name: diff --git a/llama_stack/models/llama/llama3/tokenizer.py b/llama_stack/models/llama/llama3/tokenizer.py index e5ada3599..e47b579e3 100644 --- a/llama_stack/models/llama/llama3/tokenizer.py +++ b/llama_stack/models/llama/llama3/tokenizer.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # 
the root directory of this source tree. -import os from collections.abc import Collection, Iterator, Sequence, Set from logging import getLogger from pathlib import Path @@ -14,7 +13,8 @@ from typing import ( ) import tiktoken -from tiktoken.load import load_tiktoken_bpe + +from llama_stack.models.llama.tokenizer_utils import load_bpe_file logger = getLogger(__name__) @@ -48,19 +48,20 @@ class Tokenizer: global _INSTANCE if _INSTANCE is None: - _INSTANCE = Tokenizer(os.path.join(os.path.dirname(__file__), "tokenizer.model")) + _INSTANCE = Tokenizer(Path(__file__).parent / "tokenizer.model") return _INSTANCE - def __init__(self, model_path: str): + def __init__(self, model_path: Path): """ Initializes the Tokenizer with a Tiktoken model. Args: model_path (str): The path to the Tiktoken model file. """ - assert os.path.isfile(model_path), model_path + if not model_path.exists(): + raise FileNotFoundError(f"Tokenizer model file not found: {model_path}") - mergeable_ranks = load_tiktoken_bpe(model_path) + mergeable_ranks = load_bpe_file(model_path) num_base_tokens = len(mergeable_ranks) special_tokens = [ "<|begin_of_text|>", @@ -83,7 +84,7 @@ class Tokenizer: self.special_tokens = {token: num_base_tokens + i for i, token in enumerate(special_tokens)} self.model = tiktoken.Encoding( - name=Path(model_path).name, + name=model_path.name, pat_str=self.pat_str, mergeable_ranks=mergeable_ranks, special_tokens=self.special_tokens, diff --git a/llama_stack/models/llama/llama4/tokenizer.py b/llama_stack/models/llama/llama4/tokenizer.py index 74070d43e..e12b2cae0 100644 --- a/llama_stack/models/llama/llama4/tokenizer.py +++ b/llama_stack/models/llama/llama4/tokenizer.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import os from collections.abc import Collection, Iterator, Sequence, Set from logging import getLogger from pathlib import Path @@ -14,7 +13,8 @@ from typing import ( ) import tiktoken -from tiktoken.load import load_tiktoken_bpe + +from llama_stack.models.llama.tokenizer_utils import load_bpe_file logger = getLogger(__name__) @@ -118,19 +118,20 @@ class Tokenizer: global _INSTANCE if _INSTANCE is None: - _INSTANCE = Tokenizer(os.path.join(os.path.dirname(__file__), "tokenizer.model")) + _INSTANCE = Tokenizer(Path(__file__).parent / "tokenizer.model") return _INSTANCE - def __init__(self, model_path: str): + def __init__(self, model_path: Path): """ Initializes the Tokenizer with a Tiktoken model. Args: - model_path (str): The path to the Tiktoken model file. + model_path (Path): The path to the Tiktoken model file. 
""" - assert os.path.isfile(model_path), model_path + if not model_path.exists(): + raise FileNotFoundError(f"Tokenizer model file not found: {model_path}") - mergeable_ranks = load_tiktoken_bpe(model_path) + mergeable_ranks = load_bpe_file(model_path) num_base_tokens = len(mergeable_ranks) special_tokens = BASIC_SPECIAL_TOKENS + LLAMA4_SPECIAL_TOKENS @@ -144,7 +145,7 @@ class Tokenizer: self.special_tokens = {token: num_base_tokens + i for i, token in enumerate(special_tokens)} self.model = tiktoken.Encoding( - name=Path(model_path).name, + name=model_path.name, pat_str=self.O200K_PATTERN, mergeable_ranks=mergeable_ranks, special_tokens=self.special_tokens, diff --git a/llama_stack/models/llama/tokenizer_utils.py b/llama_stack/models/llama/tokenizer_utils.py new file mode 100644 index 000000000..9830bb61b --- /dev/null +++ b/llama_stack/models/llama/tokenizer_utils.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import base64 +from pathlib import Path + +from llama_stack.log import get_logger + +logger = get_logger(__name__, "tokenizer_utils") + + +def load_bpe_file(model_path: Path) -> dict[bytes, int]: + """ + Load BPE file directly and return mergeable ranks. + + Args: + model_path (Path): Path to the BPE model file. + + Returns: + dict[bytes, int]: Dictionary mapping byte sequences to their ranks. + """ + mergeable_ranks = {} + + with open(model_path, encoding="utf-8") as f: + content = f.read() + + for line in content.splitlines(): + if not line.strip(): # Skip empty lines + continue + try: + token, rank = line.split() + mergeable_ranks[base64.b64decode(token)] = int(rank) + except Exception as e: + logger.warning(f"Failed to parse line '{line}': {e}") + continue + + return mergeable_ranks diff --git a/llama_stack/providers/inline/agents/meta_reference/__init__.py b/llama_stack/providers/inline/agents/meta_reference/__init__.py index 7503b8c90..4a77e65b9 100644 --- a/llama_stack/providers/inline/agents/meta_reference/__init__.py +++ b/llama_stack/providers/inline/agents/meta_reference/__init__.py @@ -6,12 +6,12 @@ from typing import Any -from llama_stack.distribution.datatypes import Api +from llama_stack.distribution.datatypes import AccessRule, Api from .config import MetaReferenceAgentsImplConfig -async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: dict[Api, Any]): +async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: dict[Api, Any], policy: list[AccessRule]): from .agents import MetaReferenceAgentsImpl impl = MetaReferenceAgentsImpl( @@ -21,6 +21,7 @@ async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: dict[Ap deps[Api.safety], deps[Api.tool_runtime], deps[Api.tool_groups], + policy, ) await impl.initialize() return impl diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 2e387e7e8..937bd0341 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -60,6 +60,7 @@ from llama_stack.apis.inference import ( from llama_stack.apis.safety import Safety from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime from llama_stack.apis.vector_io import VectorIO +from llama_stack.distribution.datatypes import AccessRule from 
llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ( BuiltinTool, @@ -96,13 +97,14 @@ class ChatAgent(ShieldRunnerMixin): vector_io_api: VectorIO, persistence_store: KVStore, created_at: str, + policy: list[AccessRule], ): self.agent_id = agent_id self.agent_config = agent_config self.inference_api = inference_api self.safety_api = safety_api self.vector_io_api = vector_io_api - self.storage = AgentPersistence(agent_id, persistence_store) + self.storage = AgentPersistence(agent_id, persistence_store, policy) self.tool_runtime_api = tool_runtime_api self.tool_groups_api = tool_groups_api self.created_at = created_at diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index bcbfcbe31..ea3c5da97 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -29,6 +29,7 @@ from llama_stack.apis.agents import ( Session, Turn, ) +from llama_stack.apis.agents.openai_responses import OpenAIResponseText from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.apis.inference import ( Inference, @@ -40,6 +41,7 @@ from llama_stack.apis.inference import ( from llama_stack.apis.safety import Safety from llama_stack.apis.tools import ToolGroups, ToolRuntime from llama_stack.apis.vector_io import VectorIO +from llama_stack.distribution.datatypes import AccessRule from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl from llama_stack.providers.utils.pagination import paginate_records from llama_stack.providers.utils.responses.responses_store import ResponsesStore @@ -61,6 +63,7 @@ class MetaReferenceAgentsImpl(Agents): safety_api: Safety, tool_runtime_api: ToolRuntime, tool_groups_api: ToolGroups, + policy: list[AccessRule], ): self.config = config self.inference_api = inference_api @@ -71,6 +74,7 @@ class MetaReferenceAgentsImpl(Agents): self.in_memory_store = InmemoryKVStoreImpl() self.openai_responses_impl: OpenAIResponsesImpl | None = None + self.policy = policy async def initialize(self) -> None: self.persistence_store = await kvstore_impl(self.config.persistence_store) @@ -129,6 +133,7 @@ class MetaReferenceAgentsImpl(Agents): self.persistence_store if agent_info.enable_session_persistence else self.in_memory_store ), created_at=agent_info.created_at, + policy=self.policy, ) async def create_agent_session( @@ -324,10 +329,12 @@ class MetaReferenceAgentsImpl(Agents): store: bool | None = True, stream: bool | None = False, temperature: float | None = None, + text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, + max_infer_iters: int | None = 10, ) -> OpenAIResponseObject: return await self.openai_responses_impl.create_openai_response( - input, model, instructions, previous_response_id, store, stream, temperature, tools + input, model, instructions, previous_response_id, store, stream, temperature, text, tools, max_infer_iters ) async def list_openai_responses( diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py index 1fcb1c461..0ff6dc2c5 100644 --- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py +++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py @@ -8,7 +8,7 @@ import json import time import uuid from collections.abc import AsyncIterator -from typing import Any, cast 
+from typing import Any from openai.types.chat import ChatCompletionToolParam from pydantic import BaseModel @@ -37,6 +37,8 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseOutputMessageFunctionToolCall, OpenAIResponseOutputMessageMCPListTools, OpenAIResponseOutputMessageWebSearchToolCall, + OpenAIResponseText, + OpenAIResponseTextFormat, ) from llama_stack.apis.inference.inference import ( Inference, @@ -50,7 +52,12 @@ from llama_stack.apis.inference.inference import ( OpenAIChoice, OpenAIDeveloperMessageParam, OpenAIImageURL, + OpenAIJSONSchema, OpenAIMessageParam, + OpenAIResponseFormatJSONObject, + OpenAIResponseFormatJSONSchema, + OpenAIResponseFormatParam, + OpenAIResponseFormatText, OpenAISystemMessageParam, OpenAIToolMessageParam, OpenAIUserMessageParam, @@ -158,6 +165,21 @@ async def _convert_chat_choice_to_response_message(choice: OpenAIChoice) -> Open ) +async def _convert_response_text_to_chat_response_format(text: OpenAIResponseText) -> OpenAIResponseFormatParam: + """ + Convert an OpenAI Response text parameter into an OpenAI Chat Completion response format. + """ + if not text.format or text.format["type"] == "text": + return OpenAIResponseFormatText(type="text") + if text.format["type"] == "json_object": + return OpenAIResponseFormatJSONObject() + if text.format["type"] == "json_schema": + return OpenAIResponseFormatJSONSchema( + json_schema=OpenAIJSONSchema(name=text.format["name"], schema=text.format["schema"]) + ) + raise ValueError(f"Unsupported text format: {text.format}") + + async def _get_message_type_by_role(role: str): role_to_type = { "user": OpenAIUserMessageParam, @@ -178,8 +200,8 @@ class ChatCompletionContext(BaseModel): messages: list[OpenAIMessageParam] tools: list[ChatCompletionToolParam] | None = None mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] - stream: bool temperature: float | None + response_format: OpenAIResponseFormatParam class OpenAIResponsesImpl: @@ -258,37 +280,6 @@ class OpenAIResponsesImpl: """ return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order) - async def _process_response_choices( - self, - chat_response: OpenAIChatCompletion, - ctx: ChatCompletionContext, - tools: list[OpenAIResponseInputTool] | None, - ) -> list[OpenAIResponseOutput]: - """Handle tool execution and response message creation.""" - output_messages: list[OpenAIResponseOutput] = [] - # Execute tool calls if any - for choice in chat_response.choices: - if choice.message.tool_calls and tools: - # Assume if the first tool is a function, all tools are functions - if tools[0].type == "function": - for tool_call in choice.message.tool_calls: - output_messages.append( - OpenAIResponseOutputMessageFunctionToolCall( - arguments=tool_call.function.arguments or "", - call_id=tool_call.id, - name=tool_call.function.name or "", - id=f"fc_{uuid.uuid4()}", - status="completed", - ) - ) - else: - tool_messages = await self._execute_tool_and_return_final_output(choice, ctx) - output_messages.extend(tool_messages) - else: - output_messages.append(await _convert_chat_choice_to_response_message(choice)) - - return output_messages - async def _store_response( self, response: OpenAIResponseObject, @@ -331,10 +322,52 @@ class OpenAIResponsesImpl: store: bool | None = True, stream: bool | None = False, temperature: float | None = None, + text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, + max_infer_iters: int | None = 10, ): - stream = False if stream is None else 
stream + stream = bool(stream) + text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text + stream_gen = self._create_streaming_response( + input=input, + model=model, + instructions=instructions, + previous_response_id=previous_response_id, + store=store, + temperature=temperature, + text=text, + tools=tools, + max_infer_iters=max_infer_iters, + ) + + if stream: + return stream_gen + else: + response = None + async for stream_chunk in stream_gen: + if stream_chunk.type == "response.completed": + if response is not None: + raise ValueError("The response stream completed multiple times! Earlier response: {response}") + response = stream_chunk.response + # don't leave the generator half complete! + + if response is None: + raise ValueError("The response stream never completed") + return response + + async def _create_streaming_response( + self, + input: str | list[OpenAIResponseInput], + model: str, + instructions: str | None = None, + previous_response_id: str | None = None, + store: bool | None = True, + temperature: float | None = None, + text: OpenAIResponseText | None = None, + tools: list[OpenAIResponseInputTool] | None = None, + max_infer_iters: int | None = 10, + ) -> AsyncIterator[OpenAIResponseObjectStream]: output_messages: list[OpenAIResponseOutput] = [] # Input preprocessing @@ -342,7 +375,10 @@ class OpenAIResponsesImpl: messages = await _convert_response_input_to_chat_messages(input) await self._prepend_instructions(messages, instructions) - # Tool setup + # Structured outputs + response_format = await _convert_response_text_to_chat_response_format(text) + + # Tool setup, TODO: refactor this slightly since this can also yield events chat_tools, mcp_tool_to_server, mcp_list_message = ( await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None) ) @@ -354,89 +390,10 @@ class OpenAIResponsesImpl: messages=messages, tools=chat_tools, mcp_tool_to_server=mcp_tool_to_server, - stream=stream, temperature=temperature, + response_format=response_format, ) - inference_result = await self.inference_api.openai_chat_completion( - model=model, - messages=messages, - tools=chat_tools, - stream=stream, - temperature=temperature, - ) - - if stream: - return self._create_streaming_response( - inference_result=inference_result, - ctx=ctx, - output_messages=output_messages, - input=input, - model=model, - store=store, - tools=tools, - ) - else: - return await self._create_non_streaming_response( - inference_result=inference_result, - ctx=ctx, - output_messages=output_messages, - input=input, - model=model, - store=store, - tools=tools, - ) - - async def _create_non_streaming_response( - self, - inference_result: Any, - ctx: ChatCompletionContext, - output_messages: list[OpenAIResponseOutput], - input: str | list[OpenAIResponseInput], - model: str, - store: bool | None, - tools: list[OpenAIResponseInputTool] | None, - ) -> OpenAIResponseObject: - chat_response = OpenAIChatCompletion(**inference_result.model_dump()) - - # Process response choices (tool execution and message creation) - output_messages.extend( - await self._process_response_choices( - chat_response=chat_response, - ctx=ctx, - tools=tools, - ) - ) - - response = OpenAIResponseObject( - created_at=chat_response.created, - id=f"resp-{uuid.uuid4()}", - model=model, - object="response", - status="completed", - output=output_messages, - ) - logger.debug(f"OpenAI Responses response: {response}") - - # Store response if requested - if store: - await self._store_response( - 
response=response, - input=input, - ) - - return response - - async def _create_streaming_response( - self, - inference_result: Any, - ctx: ChatCompletionContext, - output_messages: list[OpenAIResponseOutput], - input: str | list[OpenAIResponseInput], - model: str, - store: bool | None, - tools: list[OpenAIResponseInputTool] | None, - ) -> AsyncIterator[OpenAIResponseObjectStream]: # Create initial response and emit response.created immediately response_id = f"resp-{uuid.uuid4()}" created_at = int(time.time()) @@ -448,87 +405,144 @@ class OpenAIResponsesImpl: object="response", status="in_progress", output=output_messages.copy(), + text=text, ) - # Emit response.created immediately yield OpenAIResponseObjectStreamResponseCreated(response=initial_response) - # For streaming, inference_result is an async iterator of chunks - # Stream chunks and emit delta events as they arrive - chat_response_id = "" - chat_response_content = [] - chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {} - chunk_created = 0 - chunk_model = "" - chunk_finish_reason = "" - sequence_number = 0 + n_iter = 0 + messages = ctx.messages.copy() - # Create a placeholder message item for delta events - message_item_id = f"msg_{uuid.uuid4()}" - - async for chunk in inference_result: - chat_response_id = chunk.id - chunk_created = chunk.created - chunk_model = chunk.model - for chunk_choice in chunk.choices: - # Emit incremental text content as delta events - if chunk_choice.delta.content: - sequence_number += 1 - yield OpenAIResponseObjectStreamResponseOutputTextDelta( - content_index=0, - delta=chunk_choice.delta.content, - item_id=message_item_id, - output_index=0, - sequence_number=sequence_number, - ) - - # Collect content for final response - chat_response_content.append(chunk_choice.delta.content or "") - if chunk_choice.finish_reason: - chunk_finish_reason = chunk_choice.finish_reason - - # Aggregate tool call arguments across chunks, using their index as the aggregation key - if chunk_choice.delta.tool_calls: - for tool_call in chunk_choice.delta.tool_calls: - response_tool_call = chat_response_tool_calls.get(tool_call.index, None) - if response_tool_call: - response_tool_call.function.arguments += tool_call.function.arguments - else: - tool_call_dict: dict[str, Any] = tool_call.model_dump() - tool_call_dict.pop("type", None) - response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict) - chat_response_tool_calls[tool_call.index] = response_tool_call - - # Convert collected chunks to complete response - if chat_response_tool_calls: - tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())] - else: - tool_calls = None - assistant_message = OpenAIAssistantMessageParam( - content="".join(chat_response_content), - tool_calls=tool_calls, - ) - chat_response_obj = OpenAIChatCompletion( - id=chat_response_id, - choices=[ - OpenAIChoice( - message=assistant_message, - finish_reason=chunk_finish_reason, - index=0, - ) - ], - created=chunk_created, - model=chunk_model, - ) - - # Process response choices (tool execution and message creation) - output_messages.extend( - await self._process_response_choices( - chat_response=chat_response_obj, - ctx=ctx, - tools=tools, + while True: + completion_result = await self.inference_api.openai_chat_completion( + model=ctx.model, + messages=messages, + tools=ctx.tools, + stream=True, + temperature=ctx.temperature, + response_format=ctx.response_format, ) - ) + + # Process streaming chunks and build complete response + 
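# --- Editor's illustrative sketch, not part of the diff. ---
# The loop above passes ctx.response_format, which _convert_response_text_to_chat_response_format()
# builds from the Responses-style `text.format`. A dict-level analogue of that mapping
# (plain dicts stand in for the OpenAIResponseFormat* pydantic types used in the diff):
def text_format_to_response_format(fmt: dict | None) -> dict:
    if not fmt or fmt["type"] == "text":
        return {"type": "text"}
    if fmt["type"] == "json_object":
        return {"type": "json_object"}
    if fmt["type"] == "json_schema":
        return {"type": "json_schema", "json_schema": {"name": fmt["name"], "schema": fmt["schema"]}}
    raise ValueError(f"Unsupported text format: {fmt}")

assert text_format_to_response_format(None) == {"type": "text"}
assert text_format_to_response_format(
    {"type": "json_schema", "name": "weather", "schema": {"type": "object"}}
)["json_schema"]["name"] == "weather"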
chat_response_id = ""
+            chat_response_content = []
+            chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
+            chunk_created = 0
+            chunk_model = ""
+            chunk_finish_reason = ""
+            sequence_number = 0
+
+            # Create a placeholder message item for delta events
+            message_item_id = f"msg_{uuid.uuid4()}"
+
+            async for chunk in completion_result:
+                chat_response_id = chunk.id
+                chunk_created = chunk.created
+                chunk_model = chunk.model
+                for chunk_choice in chunk.choices:
+                    # Emit incremental text content as delta events
+                    if chunk_choice.delta.content:
+                        sequence_number += 1
+                        yield OpenAIResponseObjectStreamResponseOutputTextDelta(
+                            content_index=0,
+                            delta=chunk_choice.delta.content,
+                            item_id=message_item_id,
+                            output_index=0,
+                            sequence_number=sequence_number,
+                        )
+
+                    # Collect content for final response
+                    chat_response_content.append(chunk_choice.delta.content or "")
+                    if chunk_choice.finish_reason:
+                        chunk_finish_reason = chunk_choice.finish_reason
+
+                    # Aggregate tool call arguments across chunks
+                    if chunk_choice.delta.tool_calls:
+                        for tool_call in chunk_choice.delta.tool_calls:
+                            response_tool_call = chat_response_tool_calls.get(tool_call.index, None)
+                            if response_tool_call:
+                                # Don't attempt to concatenate arguments if we don't have any new arguments
+                                if tool_call.function.arguments:
+                                    # Guard against an initial None argument before we concatenate
+                                    response_tool_call.function.arguments = (
+                                        response_tool_call.function.arguments or ""
+                                    ) + tool_call.function.arguments
+                            else:
+                                tool_call_dict: dict[str, Any] = tool_call.model_dump()
+                                tool_call_dict.pop("type", None)
+                                response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict)
+                                chat_response_tool_calls[tool_call.index] = response_tool_call
+
+            # Convert collected chunks to complete response
+            if chat_response_tool_calls:
+                tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())]
+            else:
+                tool_calls = None
+            assistant_message = OpenAIAssistantMessageParam(
+                content="".join(chat_response_content),
+                tool_calls=tool_calls,
+            )
+            current_response = OpenAIChatCompletion(
+                id=chat_response_id,
+                choices=[
+                    OpenAIChoice(
+                        message=assistant_message,
+                        finish_reason=chunk_finish_reason,
+                        index=0,
+                    )
+                ],
+                created=chunk_created,
+                model=chunk_model,
+            )
+
+            function_tool_calls = []
+            non_function_tool_calls = []
+
+            next_turn_messages = messages.copy()
+            for choice in current_response.choices:
+                next_turn_messages.append(choice.message)
+
+                if choice.message.tool_calls and tools:
+                    for tool_call in choice.message.tool_calls:
+                        if _is_function_tool_call(tool_call, tools):
+                            function_tool_calls.append(tool_call)
+                        else:
+                            non_function_tool_calls.append(tool_call)
+                else:
+                    output_messages.append(await _convert_chat_choice_to_response_message(choice))
+
+            # execute non-function tool calls
+            for tool_call in non_function_tool_calls:
+                tool_call_log, tool_response_message = await self._execute_tool_call(tool_call, ctx)
+                if tool_call_log:
+                    output_messages.append(tool_call_log)
+                if tool_response_message:
+                    next_turn_messages.append(tool_response_message)
+
+            for tool_call in function_tool_calls:
+                output_messages.append(
+                    OpenAIResponseOutputMessageFunctionToolCall(
+                        arguments=tool_call.function.arguments or "",
+                        call_id=tool_call.id,
+                        name=tool_call.function.name or "",
+                        id=f"fc_{uuid.uuid4()}",
+                        status="completed",
+                    )
+                )
+
+            if not function_tool_calls and not non_function_tool_calls:
+                break
+
+            if function_tool_calls:
+                logger.info("Exiting
inference loop since there is a function (client-side) tool call") + break + + n_iter += 1 + if n_iter >= max_infer_iters: + logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {max_infer_iters=}") + break + + messages = next_turn_messages # Create final response final_response = OpenAIResponseObject( @@ -537,18 +551,19 @@ class OpenAIResponsesImpl: model=model, object="response", status="completed", + text=text, output=output_messages, ) + # Emit response.completed + yield OpenAIResponseObjectStreamResponseCompleted(response=final_response) + if store: await self._store_response( response=final_response, input=input, ) - # Emit response.completed - yield OpenAIResponseObjectStreamResponseCompleted(response=final_response) - async def _convert_response_tools_to_chat_tools( self, tools: list[OpenAIResponseInputTool] ) -> tuple[ @@ -641,49 +656,6 @@ class OpenAIResponsesImpl: raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}") return chat_tools, mcp_tool_to_server, mcp_list_message - async def _execute_tool_and_return_final_output( - self, - choice: OpenAIChoice, - ctx: ChatCompletionContext, - ) -> list[OpenAIResponseOutput]: - output_messages: list[OpenAIResponseOutput] = [] - - if not isinstance(choice.message, OpenAIAssistantMessageParam): - return output_messages - - if not choice.message.tool_calls: - return output_messages - - next_turn_messages = ctx.messages.copy() - - # Add the assistant message with tool_calls response to the messages list - next_turn_messages.append(choice.message) - - for tool_call in choice.message.tool_calls: - # TODO: telemetry spans for tool calls - tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx) - if tool_call_log: - output_messages.append(tool_call_log) - if further_input: - next_turn_messages.append(further_input) - - tool_results_chat_response = await self.inference_api.openai_chat_completion( - model=ctx.model, - messages=next_turn_messages, - stream=ctx.stream, - temperature=ctx.temperature, - ) - # type cast to appease mypy: this is needed because we don't handle streaming properly :) - tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response) - - # Huge TODO: these are NOT the final outputs, we must keep the loop going - tool_final_outputs = [ - await _convert_chat_choice_to_response_message(choice) for choice in tool_results_chat_response.choices - ] - # TODO: Wire in annotations with URLs, titles, etc to these output messages - output_messages.extend(tool_final_outputs) - return output_messages - async def _execute_tool_call( self, tool_call: OpenAIChatCompletionToolCall, @@ -767,5 +739,20 @@ class OpenAIResponsesImpl: else: raise ValueError(f"Unknown result content type: {type(result.content)}") input_message = OpenAIToolMessageParam(content=content, tool_call_id=tool_call_id) + else: + text = str(error_exc) + input_message = OpenAIToolMessageParam(content=text, tool_call_id=tool_call_id) return message, input_message + + +def _is_function_tool_call( + tool_call: OpenAIChatCompletionToolCall, + tools: list[OpenAIResponseInputTool], +) -> bool: + if not tool_call.function: + return False + for t in tools: + if t.type == "function" and t.name == tool_call.function.name: + return True + return False diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py index 5031a4a90..25dbb5df7 100644 --- 
a/llama_stack/providers/inline/agents/meta_reference/persistence.py +++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py @@ -10,9 +10,10 @@ import uuid from datetime import datetime, timezone from llama_stack.apis.agents import AgentConfig, Session, ToolExecutionStep, Turn -from llama_stack.distribution.access_control import check_access -from llama_stack.distribution.datatypes import AccessAttributes -from llama_stack.distribution.request_headers import get_auth_attributes +from llama_stack.distribution.access_control.access_control import AccessDeniedError, is_action_allowed +from llama_stack.distribution.access_control.datatypes import AccessRule +from llama_stack.distribution.datatypes import User +from llama_stack.distribution.request_headers import get_authenticated_user from llama_stack.providers.utils.kvstore import KVStore log = logging.getLogger(__name__) @@ -22,7 +23,9 @@ class AgentSessionInfo(Session): # TODO: is this used anywhere? vector_db_id: str | None = None started_at: datetime - access_attributes: AccessAttributes | None = None + owner: User | None = None + identifier: str | None = None + type: str = "session" class AgentInfo(AgentConfig): @@ -30,24 +33,27 @@ class AgentInfo(AgentConfig): class AgentPersistence: - def __init__(self, agent_id: str, kvstore: KVStore): + def __init__(self, agent_id: str, kvstore: KVStore, policy: list[AccessRule]): self.agent_id = agent_id self.kvstore = kvstore + self.policy = policy async def create_session(self, name: str) -> str: session_id = str(uuid.uuid4()) # Get current user's auth attributes for new sessions - auth_attributes = get_auth_attributes() - access_attributes = AccessAttributes(**auth_attributes) if auth_attributes else None + user = get_authenticated_user() session_info = AgentSessionInfo( session_id=session_id, session_name=name, started_at=datetime.now(timezone.utc), - access_attributes=access_attributes, + owner=user, turns=[], + identifier=name, # should this be qualified in any way? ) + if not is_action_allowed(self.policy, "create", session_info, user): + raise AccessDeniedError() await self.kvstore.set( key=f"session:{self.agent_id}:{session_id}", @@ -73,10 +79,10 @@ class AgentPersistence: def _check_session_access(self, session_info: AgentSessionInfo) -> bool: """Check if current user has access to the session.""" # Handle backward compatibility for old sessions without access control - if not hasattr(session_info, "access_attributes"): + if not hasattr(session_info, "access_attributes") and not hasattr(session_info, "owner"): return True - return check_access(session_info.session_id, session_info.access_attributes, get_auth_attributes()) + return is_action_allowed(self.policy, "read", session_info, get_authenticated_user()) async def get_session_if_accessible(self, session_id: str) -> AgentSessionInfo | None: """Get session info if the user has access to it. For internal use by sub-session methods.""" diff --git a/llama_stack/providers/inline/files/localfs/__init__.py b/llama_stack/providers/inline/files/localfs/__init__.py new file mode 100644 index 000000000..7a04e61c6 --- /dev/null +++ b/llama_stack/providers/inline/files/localfs/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
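# --- Editor's illustrative sketch, not part of the diff. ---
# Why the backward-compatibility check in persistence.py above gained a second clause:
# presumably, sessions persisted before access control existed carry neither the legacy
# `access_attributes` field nor the new `owner` field, and such sessions stay accessible.
# A standalone analogue of that condition (the classes below are illustrative stand-ins):
class LegacySession:  # pre-access-control session record, no access metadata at all
    pass

class OwnedSession:  # session created after this change
    owner = "some-user"

def predates_access_control(session) -> bool:
    return not hasattr(session, "access_attributes") and not hasattr(session, "owner")

assert predates_access_control(LegacySession())
assert not predates_access_control(OwnedSession())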
+ +from typing import Any + +from llama_stack.distribution.datatypes import Api + +from .config import LocalfsFilesImplConfig +from .files import LocalfsFilesImpl + +__all__ = ["LocalfsFilesImpl", "LocalfsFilesImplConfig"] + + +async def get_provider_impl(config: LocalfsFilesImplConfig, deps: dict[Api, Any]): + impl = LocalfsFilesImpl(config) + await impl.initialize() + return impl diff --git a/llama_stack/providers/inline/files/localfs/config.py b/llama_stack/providers/inline/files/localfs/config.py new file mode 100644 index 000000000..757a70742 --- /dev/null +++ b/llama_stack/providers/inline/files/localfs/config.py @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any + +from pydantic import BaseModel, Field + +from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig + + +class LocalfsFilesImplConfig(BaseModel): + storage_dir: str = Field( + description="Directory to store uploaded files", + ) + metadata_store: SqlStoreConfig = Field( + description="SQL store configuration for file metadata", + ) + ttl_secs: int = 365 * 24 * 60 * 60 # 1 year + + @classmethod + def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]: + return { + "storage_dir": "${env.FILES_STORAGE_DIR:" + __distro_dir__ + "/files}", + "metadata_store": SqliteSqlStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="files_metadata.db", + ), + } diff --git a/llama_stack/providers/inline/files/localfs/files.py b/llama_stack/providers/inline/files/localfs/files.py new file mode 100644 index 000000000..f2891c528 --- /dev/null +++ b/llama_stack/providers/inline/files/localfs/files.py @@ -0,0 +1,214 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
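# --- Editor's illustrative sketch, not part of the diff. ---
# The configs above (and throughout this PR) rely on "${env.NAME:default}" placeholders.
# The real substitution happens in llama_stack's config loading, which is not shown in
# this diff; the minimal standalone resolver below only illustrates the intended
# "use the environment variable if set, otherwise the default" semantics.
import os
import re

_PATTERN = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*):?([^}]*)\}")

def resolve(value: str) -> str:
    return _PATTERN.sub(lambda m: os.environ.get(m.group(1), m.group(2)), value)

os.environ.pop("FILES_STORAGE_DIR", None)
print(resolve("${env.FILES_STORAGE_DIR:~/.llama/distributions/demo/files}"))
# -> ~/.llama/distributions/demo/files  (the default, because the variable is unset)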
+ +import time +import uuid +from pathlib import Path +from typing import Annotated + +from fastapi import File, Form, Response, UploadFile + +from llama_stack.apis.common.responses import Order +from llama_stack.apis.files import ( + Files, + ListOpenAIFileResponse, + OpenAIFileDeleteResponse, + OpenAIFileObject, + OpenAIFilePurpose, +) +from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType +from llama_stack.providers.utils.sqlstore.sqlstore import SqlStore, sqlstore_impl + +from .config import LocalfsFilesImplConfig + + +class LocalfsFilesImpl(Files): + def __init__(self, config: LocalfsFilesImplConfig) -> None: + self.config = config + self.sql_store: SqlStore | None = None + + async def initialize(self) -> None: + """Initialize the files provider by setting up storage directory and metadata database.""" + # Create storage directory if it doesn't exist + storage_path = Path(self.config.storage_dir) + storage_path.mkdir(parents=True, exist_ok=True) + + # Initialize SQL store for metadata + self.sql_store = sqlstore_impl(self.config.metadata_store) + await self.sql_store.create_table( + "openai_files", + { + "id": ColumnDefinition(type=ColumnType.STRING, primary_key=True), + "filename": ColumnType.STRING, + "purpose": ColumnType.STRING, + "bytes": ColumnType.INTEGER, + "created_at": ColumnType.INTEGER, + "expires_at": ColumnType.INTEGER, + "file_path": ColumnType.STRING, # Path to actual file on disk + }, + ) + + def _generate_file_id(self) -> str: + """Generate a unique file ID for OpenAI API.""" + return f"file-{uuid.uuid4().hex}" + + def _get_file_path(self, file_id: str) -> Path: + """Get the filesystem path for a file ID.""" + return Path(self.config.storage_dir) / file_id + + # OpenAI Files API Implementation + async def openai_upload_file( + self, + file: Annotated[UploadFile, File()], + purpose: Annotated[OpenAIFilePurpose, Form()], + ) -> OpenAIFileObject: + """Upload a file that can be used across various endpoints.""" + if not self.sql_store: + raise RuntimeError("Files provider not initialized") + + file_id = self._generate_file_id() + file_path = self._get_file_path(file_id) + + content = await file.read() + file_size = len(content) + + with open(file_path, "wb") as f: + f.write(content) + + created_at = int(time.time()) + expires_at = created_at + self.config.ttl_secs + + await self.sql_store.insert( + "openai_files", + { + "id": file_id, + "filename": file.filename or "uploaded_file", + "purpose": purpose.value, + "bytes": file_size, + "created_at": created_at, + "expires_at": expires_at, + "file_path": file_path.as_posix(), + }, + ) + + return OpenAIFileObject( + id=file_id, + filename=file.filename or "uploaded_file", + purpose=purpose, + bytes=file_size, + created_at=created_at, + expires_at=expires_at, + ) + + async def openai_list_files( + self, + after: str | None = None, + limit: int | None = 10000, + order: Order | None = Order.desc, + purpose: OpenAIFilePurpose | None = None, + ) -> ListOpenAIFileResponse: + """Returns a list of files that belong to the user's organization.""" + if not self.sql_store: + raise RuntimeError("Files provider not initialized") + + # TODO: Implement 'after' pagination properly + if after: + raise NotImplementedError("After pagination not yet implemented") + + where = None + if purpose: + where = {"purpose": purpose.value} + + rows = await self.sql_store.fetch_all( + "openai_files", + where=where, + order_by=[("created_at", order.value if order else Order.desc.value)], + limit=limit, + ) + + files = [ + 
OpenAIFileObject( + id=row["id"], + filename=row["filename"], + purpose=OpenAIFilePurpose(row["purpose"]), + bytes=row["bytes"], + created_at=row["created_at"], + expires_at=row["expires_at"], + ) + for row in rows + ] + + return ListOpenAIFileResponse( + data=files, + has_more=False, # TODO: Implement proper pagination + first_id=files[0].id if files else "", + last_id=files[-1].id if files else "", + ) + + async def openai_retrieve_file(self, file_id: str) -> OpenAIFileObject: + """Returns information about a specific file.""" + if not self.sql_store: + raise RuntimeError("Files provider not initialized") + + row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + if not row: + raise ValueError(f"File with id {file_id} not found") + + return OpenAIFileObject( + id=row["id"], + filename=row["filename"], + purpose=OpenAIFilePurpose(row["purpose"]), + bytes=row["bytes"], + created_at=row["created_at"], + expires_at=row["expires_at"], + ) + + async def openai_delete_file(self, file_id: str) -> OpenAIFileDeleteResponse: + """Delete a file.""" + if not self.sql_store: + raise RuntimeError("Files provider not initialized") + + row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + if not row: + raise ValueError(f"File with id {file_id} not found") + + # Delete physical file + file_path = Path(row["file_path"]) + if file_path.exists(): + file_path.unlink() + + # Delete metadata from database + await self.sql_store.delete("openai_files", where={"id": file_id}) + + return OpenAIFileDeleteResponse( + id=file_id, + deleted=True, + ) + + async def openai_retrieve_file_content(self, file_id: str) -> Response: + """Returns the contents of the specified file.""" + if not self.sql_store: + raise RuntimeError("Files provider not initialized") + + # Get file metadata + row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + if not row: + raise ValueError(f"File with id {file_id} not found") + + # Read file content + file_path = Path(row["file_path"]) + if not file_path.exists(): + raise ValueError(f"File content not found on disk: {file_path}") + + with open(file_path, "rb") as f: + content = f.read() + + # Return as binary response with appropriate content type + return Response( + content=content, + media_type="application/octet-stream", + headers={"Content-Disposition": f'attachment; filename="{row["filename"]}"'}, + ) diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py index 438cb14a0..bf54462b5 100644 --- a/llama_stack/providers/inline/inference/vllm/vllm.py +++ b/llama_stack/providers/inline/inference/vllm/vllm.py @@ -40,6 +40,7 @@ from llama_stack.apis.inference import ( JsonSchemaResponseFormat, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -410,6 +411,16 @@ class VLLMInferenceImpl( ) -> EmbeddingsResponse: raise NotImplementedError() + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + async def chat_completion( self, model_id: str, diff --git a/llama_stack/providers/inline/telemetry/meta_reference/config.py b/llama_stack/providers/inline/telemetry/meta_reference/config.py index af53bfd9c..93509040c 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/config.py +++ 
b/llama_stack/providers/inline/telemetry/meta_reference/config.py @@ -30,7 +30,7 @@ class TelemetryConfig(BaseModel): ) service_name: str = Field( # service name is always the same, use zero-width space to avoid clutter - default="", + default="\u200b", description="The service name to use for telemetry", ) sinks: list[TelemetrySink] = Field( @@ -52,7 +52,7 @@ class TelemetryConfig(BaseModel): @classmethod def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> dict[str, Any]: return { - "service_name": "${env.OTEL_SERVICE_NAME:}", + "service_name": "${env.OTEL_SERVICE_NAME:\u200b}", "sinks": "${env.TELEMETRY_SINKS:console,sqlite}", "sqlite_db_path": "${env.SQLITE_STORE_DIR:" + __distro_dir__ + "}/" + db_name, } diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py index c2d264c91..4776d47d0 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -146,7 +146,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti ] for i, chunk in enumerate(chunks): metadata = chunk.metadata - tokens += metadata["token_count"] + tokens += metadata.get("token_count", 0) tokens += metadata.get("metadata_token_count", 0) if tokens > query_config.max_tokens_in_context: diff --git a/llama_stack/providers/registry/agents.py b/llama_stack/providers/registry/agents.py index e0801a8d1..e47f84c65 100644 --- a/llama_stack/providers/registry/agents.py +++ b/llama_stack/providers/registry/agents.py @@ -24,7 +24,7 @@ def available_providers() -> list[ProviderSpec]: "pandas", "scikit-learn", ] - + kvstore_dependencies(), + + kvstore_dependencies(), # TODO make this dynamic based on the kvstore config module="llama_stack.providers.inline.agents.meta_reference", config_class="llama_stack.providers.inline.agents.meta_reference.MetaReferenceAgentsImplConfig", api_dependencies=[ diff --git a/llama_stack/providers/registry/files.py b/llama_stack/providers/registry/files.py index fb23436bb..dc5443c3a 100644 --- a/llama_stack/providers/registry/files.py +++ b/llama_stack/providers/registry/files.py @@ -4,8 +4,22 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
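# --- Editor's note, written as code; not part of the diff. ---
# The telemetry default above changes from "" to "\u200b" (ZERO WIDTH SPACE), per the
# in-code comment "use zero-width space to avoid clutter": the value renders as nothing
# in dashboards yet is a non-empty string, so it survives emptiness checks. That second
# half of the rationale is inferred; the diff only states the clutter motivation.
zwsp = "\u200b"
assert zwsp != "" and len(zwsp) == 1
print(repr(zwsp))  # '\u200b'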
-from llama_stack.providers.datatypes import ProviderSpec +from llama_stack.providers.datatypes import ( + Api, + InlineProviderSpec, + ProviderSpec, +) +from llama_stack.providers.utils.sqlstore.sqlstore import sql_store_pip_packages def available_providers() -> list[ProviderSpec]: - return [] + return [ + InlineProviderSpec( + api=Api.files, + provider_type="inline::localfs", + # TODO: make this dynamic according to the sql store type + pip_packages=sql_store_pip_packages, + module="llama_stack.providers.inline.files.localfs", + config_class="llama_stack.providers.inline.files.localfs.config.LocalfsFilesImplConfig", + ), + ] diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 7b49ef09b..66f2e8bce 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -15,7 +15,6 @@ from llama_stack.providers.datatypes import ( META_REFERENCE_DEPS = [ "accelerate", - "blobfile", "fairscale", "torch", "torchvision", diff --git a/llama_stack/providers/registry/tool_runtime.py b/llama_stack/providers/registry/tool_runtime.py index 277914df2..fa359f6b5 100644 --- a/llama_stack/providers/registry/tool_runtime.py +++ b/llama_stack/providers/registry/tool_runtime.py @@ -20,7 +20,6 @@ def available_providers() -> list[ProviderSpec]: api=Api.tool_runtime, provider_type="inline::rag-runtime", pip_packages=[ - "blobfile", "chardet", "pypdf", "tqdm", diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 0404a578f..952d86f1a 100644 --- a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -22,6 +22,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -197,3 +198,13 @@ class BedrockInferenceAdapter( response_body = json.loads(response.get("body").read()) embeddings.append(response_body.get("embedding")) return EmbeddingsResponse(embeddings=embeddings) + + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py index 685375346..952118e24 100644 --- a/llama_stack/providers/remote/inference/cerebras/cerebras.py +++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py @@ -21,6 +21,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -194,3 +195,13 @@ class CerebrasInferenceAdapter( task_type: EmbeddingTaskType | None = None, ) -> EmbeddingsResponse: raise NotImplementedError() + + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py index 5c36eac3e..1dc18b97f 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py 
@@ -20,6 +20,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -152,3 +153,13 @@ class DatabricksInferenceAdapter( task_type: EmbeddingTaskType | None = None, ) -> EmbeddingsResponse: raise NotImplementedError() + + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index b6d3984c6..75a9e33e2 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -37,6 +37,7 @@ from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, + OpenAIEmbeddingsResponse, OpenAIMessageParam, OpenAIResponseFormatParam, ) @@ -254,7 +255,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv params = { "model": request.model, **input_dict, - "stream": request.stream, + "stream": bool(request.stream), **self._build_options(request.sampling_params, request.response_format, request.logprobs), } logger.debug(f"params to fireworks: {params}") @@ -286,6 +287,16 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv embeddings = [data.embedding for data in response.data] return EmbeddingsResponse(embeddings=embeddings) + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + async def openai_completion( self, model: str, diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index 333486fe4..4c68322e0 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -29,6 +29,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -238,6 +239,16 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): # return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data]) + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + async def chat_completion( self, model_id: str, diff --git a/llama_stack/providers/remote/inference/ollama/models.py b/llama_stack/providers/remote/inference/ollama/models.py index 42e364105..8f0f0421a 100644 --- a/llama_stack/providers/remote/inference/ollama/models.py +++ b/llama_stack/providers/remote/inference/ollama/models.py @@ -12,7 +12,7 @@ from llama_stack.providers.utils.inference.model_registry import ( build_model_entry, ) -model_entries = [ +MODEL_ENTRIES = [ build_hf_repo_model_entry( "llama3.1:8b-instruct-fp16", CoreModelId.llama3_1_8b_instruct.value, diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 3b4287673..358a29d4c 100644 --- 
a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -5,6 +5,7 @@ # the root directory of this source tree. +import uuid from collections.abc import AsyncGenerator, AsyncIterator from typing import Any @@ -32,6 +33,7 @@ from llama_stack.apis.inference import ( JsonSchemaResponseFormat, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -76,7 +78,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( request_has_media, ) -from .models import model_entries +from .models import MODEL_ENTRIES logger = get_logger(name=__name__, category="inference") @@ -86,7 +88,7 @@ class OllamaInferenceAdapter( ModelsProtocolPrivate, ): def __init__(self, url: str) -> None: - self.register_helper = ModelRegistryHelper(model_entries) + self.register_helper = ModelRegistryHelper(MODEL_ENTRIES) self.url = url @property @@ -343,21 +345,27 @@ class OllamaInferenceAdapter( model = await self.register_helper.register_model(model) except ValueError: pass # Ignore statically unknown model, will check live listing + + if model.provider_resource_id is None: + raise ValueError("Model provider_resource_id cannot be None") + if model.model_type == ModelType.embedding: logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...") - await self.client.pull(model.provider_resource_id) + # TODO: you should pull here only if the model is not found in a list + response = await self.client.list() + if model.provider_resource_id not in [m.model for m in response.models]: + await self.client.pull(model.provider_resource_id) + # we use list() here instead of ps() - # - ps() only lists running models, not available models # - models not currently running are run by the ollama server as needed response = await self.client.list() - available_models = [m["model"] for m in response["models"]] - if model.provider_resource_id is None: - raise ValueError("Model provider_resource_id cannot be None") + available_models = [m.model for m in response.models] provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id) if provider_resource_id is None: provider_resource_id = model.provider_resource_id if provider_resource_id not in available_models: - available_models_latest = [m["model"].split(":latest")[0] for m in response["models"]] + available_models_latest = [m.model.split(":latest")[0] for m in response.models] if provider_resource_id in available_models_latest: logger.warning( f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'" @@ -370,6 +378,16 @@ class OllamaInferenceAdapter( return model + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + async def openai_completion( self, model: str, @@ -469,7 +487,25 @@ class OllamaInferenceAdapter( top_p=top_p, user=user, ) - return await self.openai_client.chat.completions.create(**params) # type: ignore + response = await self.openai_client.chat.completions.create(**params) + return await self._adjust_ollama_chat_completion_response_ids(response) + + async def _adjust_ollama_chat_completion_response_ids( + self, + response: OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk], + ) -> OpenAIChatCompletion | 
AsyncIterator[OpenAIChatCompletionChunk]: + id = f"chatcmpl-{uuid.uuid4()}" + if isinstance(response, AsyncIterator): + + async def stream_with_chunk_ids() -> AsyncIterator[OpenAIChatCompletionChunk]: + async for chunk in response: + chunk.id = id + yield chunk + + return stream_with_chunk_ids() + else: + response.id = id + return response async def batch_completion( self, diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py index c3c25edd3..6f3a686a8 100644 --- a/llama_stack/providers/remote/inference/openai/openai.py +++ b/llama_stack/providers/remote/inference/openai/openai.py @@ -14,6 +14,9 @@ from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, + OpenAIEmbeddingData, + OpenAIEmbeddingsResponse, + OpenAIEmbeddingUsage, OpenAIMessageParam, OpenAIResponseFormatParam, ) @@ -38,6 +41,7 @@ logger = logging.getLogger(__name__) # | batch_chat_completion | LiteLLMOpenAIMixin | # | openai_completion | AsyncOpenAI | # | openai_chat_completion | AsyncOpenAI | +# | openai_embeddings | AsyncOpenAI | # class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): def __init__(self, config: OpenAIConfig) -> None: @@ -171,3 +175,51 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): user=user, ) return await self._openai_client.chat.completions.create(**params) + + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + model_id = (await self.model_store.get_model(model)).provider_resource_id + if model_id.startswith("openai/"): + model_id = model_id[len("openai/") :] + + # Prepare parameters for OpenAI embeddings API + params = { + "model": model_id, + "input": input, + } + + if encoding_format is not None: + params["encoding_format"] = encoding_format + if dimensions is not None: + params["dimensions"] = dimensions + if user is not None: + params["user"] = user + + # Call OpenAI embeddings API + response = await self._openai_client.embeddings.create(**params) + + data = [] + for i, embedding_data in enumerate(response.data): + data.append( + OpenAIEmbeddingData( + embedding=embedding_data.embedding, + index=i, + ) + ) + + usage = OpenAIEmbeddingUsage( + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + ) + + return OpenAIEmbeddingsResponse( + data=data, + model=response.model, + usage=usage, + ) diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py index 78ee52641..6cf4680e2 100644 --- a/llama_stack/providers/remote/inference/passthrough/passthrough.py +++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py @@ -19,6 +19,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -210,6 +211,16 @@ class PassthroughInferenceAdapter(Inference): task_type=task_type, ) + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + async def openai_completion( self, model: str, diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py 
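# --- Editor's illustrative sketch, not part of the diff. ---
# The Ollama adapter above stamps one freshly generated "chatcmpl-<uuid>" id onto every
# streamed chunk by wrapping the async iterator. A standalone analogue of that wrapper
# (the Chunk type here is a stand-in, not a llama_stack or OpenAI class):
import asyncio
import uuid
from dataclasses import dataclass

@dataclass
class Chunk:
    id: str
    text: str

async def fake_stream():
    for piece in ("Hel", "lo"):
        yield Chunk(id="native-ollama-id", text=piece)

async def with_stable_id(stream):
    new_id = f"chatcmpl-{uuid.uuid4()}"
    async for chunk in stream:
        chunk.id = new_id  # overwrite the provider-assigned id, as the adapter does
        yield chunk

async def main():
    ids = {chunk.id async for chunk in with_stable_id(fake_stream())}
    assert len(ids) == 1 and next(iter(ids)).startswith("chatcmpl-")

asyncio.run(main())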
index 2706aa15e..f8c98893e 100644 --- a/llama_stack/providers/remote/inference/runpod/runpod.py +++ b/llama_stack/providers/remote/inference/runpod/runpod.py @@ -8,6 +8,7 @@ from collections.abc import AsyncGenerator from openai import OpenAI from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.apis.inference.inference import OpenAIEmbeddingsResponse # from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper @@ -134,3 +135,13 @@ class RunpodInferenceAdapter( task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: raise NotImplementedError() + + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py index d182aa1dc..20f863665 100644 --- a/llama_stack/providers/remote/inference/sambanova/sambanova.py +++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py @@ -218,7 +218,7 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin): "json_schema": { "name": name, "schema": fmt, - "strict": True, + "strict": False, }, } if request.tools: diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 8f6666462..292d74ef8 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -23,6 +23,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, ResponseFormatType, SamplingParams, @@ -291,6 +292,16 @@ class _HfAdapter( ) -> EmbeddingsResponse: raise NotImplementedError() + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + class TGIAdapter(_HfAdapter): async def initialize(self, config: TGIImplConfig) -> None: diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index 562e6e0ff..7305a638d 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -23,6 +23,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, ResponseFormatType, SamplingParams, @@ -267,6 +268,16 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi embeddings = [item.embedding for item in r.data] return EmbeddingsResponse(embeddings=embeddings) + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + async def openai_completion( self, model: str, diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index fe2d8bec1..9f38d9abf 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -38,6 +38,7 @@ from 
llama_stack.apis.inference import ( JsonSchemaResponseFormat, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -507,6 +508,16 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): embeddings = [data.embedding for data in response.data] return EmbeddingsResponse(embeddings=embeddings) + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + async def openai_completion( self, model: str, diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py index c1299e11f..59f5f5562 100644 --- a/llama_stack/providers/remote/inference/watsonx/watsonx.py +++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py @@ -21,6 +21,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -260,6 +261,16 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper): ) -> EmbeddingsResponse: raise NotImplementedError("embedding is not supported for watsonx") + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + raise NotImplementedError() + async def openai_completion( self, model: str, diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py index 7c8144c62..97cf87360 100644 --- a/llama_stack/providers/utils/inference/embedding_mixin.py +++ b/llama_stack/providers/utils/inference/embedding_mixin.py @@ -4,7 +4,9 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
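# --- Editor's illustrative sketch, not part of the diff. ---
# Earlier in this section the SambaNova adapter switches its structured-output request to
# "strict": False. Only the inner json_schema block is visible in that hunk; the outer
# "type" key below follows the usual OpenAI-style response_format layout and the schema
# contents are purely illustrative. The diff does not state the reason for the change.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "answer",
        "schema": {"type": "object", "properties": {"text": {"type": "string"}}},
        "strict": False,  # previously True
    },
}
print(response_format["json_schema"]["strict"])  # False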
+import base64 import logging +import struct from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -15,6 +17,9 @@ from llama_stack.apis.inference import ( EmbeddingTaskType, InterleavedContentItem, ModelStore, + OpenAIEmbeddingData, + OpenAIEmbeddingsResponse, + OpenAIEmbeddingUsage, TextTruncation, ) from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str @@ -43,6 +48,50 @@ class SentenceTransformerEmbeddingMixin: ) return EmbeddingsResponse(embeddings=embeddings) + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + # Convert input to list format if it's a single string + input_list = [input] if isinstance(input, str) else input + if not input_list: + raise ValueError("Empty list not supported") + + # Get the model and generate embeddings + model_obj = await self.model_store.get_model(model) + embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id) + embeddings = embedding_model.encode(input_list, show_progress_bar=False) + + # Convert embeddings to the requested format + data = [] + for i, embedding in enumerate(embeddings): + if encoding_format == "base64": + # Convert float array to base64 string + float_bytes = struct.pack(f"{len(embedding)}f", *embedding) + embedding_value = base64.b64encode(float_bytes).decode("ascii") + else: + # Default to float format + embedding_value = embedding.tolist() + + data.append( + OpenAIEmbeddingData( + embedding=embedding_value, + index=i, + ) + ) + + # Not returning actual token usage + usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1) + return OpenAIEmbeddingsResponse( + data=data, + model=model_obj.provider_resource_id, + usage=usage, + ) + def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer": global EMBEDDING_MODELS diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index 4d17db21e..dab10bc55 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
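# --- Editor's illustrative sketch, not part of the diff. ---
# When encoding_format="base64" is requested, the embedding mixins above pack the float
# vector with struct and base64-encode the bytes. Decoding on the receiving side is the
# reverse; the values below are chosen to be exactly representable as float32 so the
# round trip compares equal.
import base64
import struct

embedding = [0.25, -1.5, 3.0]
packed = struct.pack(f"{len(embedding)}f", *embedding)
encoded = base64.b64encode(packed).decode("ascii")

decoded = list(struct.unpack(f"{len(packed) // 4}f", base64.b64decode(encoded)))
assert decoded == embedding
print(encoded)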
+import base64 +import struct from collections.abc import AsyncGenerator, AsyncIterator from typing import Any @@ -35,6 +37,9 @@ from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, + OpenAIEmbeddingData, + OpenAIEmbeddingsResponse, + OpenAIEmbeddingUsage, OpenAIMessageParam, OpenAIResponseFormatParam, ) @@ -264,6 +269,52 @@ class LiteLLMOpenAIMixin( embeddings = [data["embedding"] for data in response["data"]] return EmbeddingsResponse(embeddings=embeddings) + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + model_obj = await self.model_store.get_model(model) + + # Convert input to list if it's a string + input_list = [input] if isinstance(input, str) else input + + # Call litellm embedding function + # litellm.drop_params = True + response = litellm.embedding( + model=self.get_litellm_model_name(model_obj.provider_resource_id), + input=input_list, + api_key=self.get_api_key(), + api_base=self.api_base, + dimensions=dimensions, + ) + + # Convert response to OpenAI format + data = [] + for i, embedding_data in enumerate(response["data"]): + # we encode to base64 if the encoding format is base64 in the request + if encoding_format == "base64": + byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"]) + embedding = base64.b64encode(byte_data).decode("utf-8") + else: + embedding = embedding_data["embedding"] + + data.append(OpenAIEmbeddingData(embedding=embedding, index=i)) + + usage = OpenAIEmbeddingUsage( + prompt_tokens=response["usage"]["prompt_tokens"], + total_tokens=response["usage"]["total_tokens"], + ) + + return OpenAIEmbeddingsResponse( + data=data, + model=model_obj.provider_resource_id, + usage=usage, + ) + async def openai_completion( self, model: str, diff --git a/llama_stack/providers/utils/kvstore/config.py b/llama_stack/providers/utils/kvstore/config.py index e9aac6e8c..e966e13ba 100644 --- a/llama_stack/providers/utils/kvstore/config.py +++ b/llama_stack/providers/utils/kvstore/config.py @@ -36,6 +36,10 @@ class RedisKVStoreConfig(CommonConfig): def url(self) -> str: return f"redis://{self.host}:{self.port}" + @property + def pip_packages(self) -> list[str]: + return ["redis"] + @classmethod def sample_run_config(cls): return { @@ -53,6 +57,10 @@ class SqliteKVStoreConfig(CommonConfig): description="File path for the sqlite database", ) + @property + def pip_packages(self) -> list[str]: + return ["aiosqlite"] + @classmethod def sample_run_config(cls, __distro_dir__: str, db_name: str = "kvstore.db"): return { @@ -65,22 +73,22 @@ class SqliteKVStoreConfig(CommonConfig): class PostgresKVStoreConfig(CommonConfig): type: Literal[KVStoreType.postgres.value] = KVStoreType.postgres.value host: str = "localhost" - port: int = 5432 + port: str = "5432" db: str = "llamastack" user: str password: str | None = None table_name: str = "llamastack_kvstore" @classmethod - def sample_run_config(cls, table_name: str = "llamastack_kvstore"): + def sample_run_config(cls, table_name: str = "llamastack_kvstore", **kwargs): return { "type": "postgres", "namespace": None, "host": "${env.POSTGRES_HOST:localhost}", "port": "${env.POSTGRES_PORT:5432}", - "db": "${env.POSTGRES_DB}", - "user": "${env.POSTGRES_USER}", - "password": "${env.POSTGRES_PASSWORD}", + "db": "${env.POSTGRES_DB:llamastack}", + "user": "${env.POSTGRES_USER:llamastack}", + 
"password": "${env.POSTGRES_PASSWORD:llamastack}", "table_name": "${env.POSTGRES_TABLE_NAME:" + table_name + "}", } @@ -100,6 +108,10 @@ class PostgresKVStoreConfig(CommonConfig): raise ValueError("Table name must be less than 63 characters") return v + @property + def pip_packages(self) -> list[str]: + return ["psycopg2-binary"] + class MongoDBKVStoreConfig(CommonConfig): type: Literal[KVStoreType.mongodb.value] = KVStoreType.mongodb.value @@ -110,6 +122,10 @@ class MongoDBKVStoreConfig(CommonConfig): password: str | None = None collection_name: str = "llamastack_kvstore" + @property + def pip_packages(self) -> list[str]: + return ["pymongo"] + @classmethod def sample_run_config(cls, collection_name: str = "llamastack_kvstore"): return { diff --git a/llama_stack/providers/utils/kvstore/kvstore.py b/llama_stack/providers/utils/kvstore/kvstore.py index 3a1ee8a26..426523d8e 100644 --- a/llama_stack/providers/utils/kvstore/kvstore.py +++ b/llama_stack/providers/utils/kvstore/kvstore.py @@ -10,6 +10,13 @@ from .config import KVStoreConfig, KVStoreType def kvstore_dependencies(): + """ + Returns all possible kvstore dependencies for registry/provider specifications. + + NOTE: For specific kvstore implementations, use config.pip_packages instead. + This function returns the union of all dependencies for cases where the specific + kvstore type is not known at declaration time (e.g., provider registries). + """ return ["aiosqlite", "psycopg2-binary", "redis", "pymongo"] diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py index 3655c7049..4cd15860b 100644 --- a/llama_stack/providers/utils/memory/vector_store.py +++ b/llama_stack/providers/utils/memory/vector_store.py @@ -171,6 +171,22 @@ def make_overlapped_chunks( return chunks +def _validate_embedding(embedding: NDArray, index: int, expected_dimension: int): + """Helper method to validate embedding format and dimensions""" + if not isinstance(embedding, (list | np.ndarray)): + raise ValueError(f"Embedding at index {index} must be a list or numpy array, got {type(embedding)}") + + if isinstance(embedding, np.ndarray): + if not np.issubdtype(embedding.dtype, np.number): + raise ValueError(f"Embedding at index {index} contains non-numeric values") + else: + if not all(isinstance(e, (float | int | np.number)) for e in embedding): + raise ValueError(f"Embedding at index {index} contains non-numeric values") + + if len(embedding) != expected_dimension: + raise ValueError(f"Embedding at index {index} has dimension {len(embedding)}, expected {expected_dimension}") + + class EmbeddingIndex(ABC): @abstractmethod async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray): @@ -199,11 +215,22 @@ class VectorDBWithIndex: self, chunks: list[Chunk], ) -> None: - embeddings_response = await self.inference_api.embeddings( - self.vector_db.embedding_model, [x.content for x in chunks] - ) - embeddings = np.array(embeddings_response.embeddings) + chunks_to_embed = [] + for i, c in enumerate(chunks): + if c.embedding is None: + chunks_to_embed.append(c) + else: + _validate_embedding(c.embedding, i, self.vector_db.embedding_dimension) + if chunks_to_embed: + resp = await self.inference_api.embeddings( + self.vector_db.embedding_model, + [c.content for c in chunks_to_embed], + ) + for c, embedding in zip(chunks_to_embed, resp.embeddings, strict=False): + c.embedding = embedding + + embeddings = np.array([c.embedding for c in chunks], dtype=np.float32) await self.index.add_chunks(chunks, 
embeddings) async def query_chunks( diff --git a/llama_stack/providers/utils/sqlstore/sqlite/sqlite.py b/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py similarity index 83% rename from llama_stack/providers/utils/sqlstore/sqlite/sqlite.py rename to llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py index 0ef5f0fa1..825220679 100644 --- a/llama_stack/providers/utils/sqlstore/sqlite/sqlite.py +++ b/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py @@ -19,10 +19,10 @@ from sqlalchemy import ( Text, select, ) -from sqlalchemy.ext.asyncio import create_async_engine +from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine -from ..api import ColumnDefinition, ColumnType, SqlStore -from ..sqlstore import SqliteSqlStoreConfig +from .api import ColumnDefinition, ColumnType, SqlStore +from .sqlstore import SqlAlchemySqlStoreConfig TYPE_MAPPING: dict[ColumnType, Any] = { ColumnType.INTEGER: Integer, @@ -35,9 +35,10 @@ TYPE_MAPPING: dict[ColumnType, Any] = { } -class SqliteSqlStoreImpl(SqlStore): - def __init__(self, config: SqliteSqlStoreConfig): - self.engine = create_async_engine(config.engine_str) +class SqlAlchemySqlStoreImpl(SqlStore): + def __init__(self, config: SqlAlchemySqlStoreConfig): + self.config = config + self.async_session = async_sessionmaker(create_async_engine(config.engine_str)) self.metadata = MetaData() async def create_table( @@ -78,13 +79,14 @@ class SqliteSqlStoreImpl(SqlStore): # Create the table in the database if it doesn't exist # checkfirst=True ensures it doesn't try to recreate if it's already there - async with self.engine.begin() as conn: + engine = create_async_engine(self.config.engine_str) + async with engine.begin() as conn: await conn.run_sync(self.metadata.create_all, tables=[sqlalchemy_table], checkfirst=True) async def insert(self, table: str, data: Mapping[str, Any]) -> None: - async with self.engine.begin() as conn: - await conn.execute(self.metadata.tables[table].insert(), data) - await conn.commit() + async with self.async_session() as session: + await session.execute(self.metadata.tables[table].insert(), data) + await session.commit() async def fetch_all( self, @@ -93,7 +95,7 @@ class SqliteSqlStoreImpl(SqlStore): limit: int | None = None, order_by: list[tuple[str, Literal["asc", "desc"]]] | None = None, ) -> list[dict[str, Any]]: - async with self.engine.begin() as conn: + async with self.async_session() as session: query = select(self.metadata.tables[table]) if where: for key, value in where.items(): @@ -117,7 +119,7 @@ class SqliteSqlStoreImpl(SqlStore): query = query.order_by(self.metadata.tables[table].c[name].desc()) else: raise ValueError(f"Invalid order '{order_type}' for column '{name}'") - result = await conn.execute(query) + result = await session.execute(query) if result.rowcount == 0: return [] return [dict(row._mapping) for row in result] @@ -142,20 +144,20 @@ class SqliteSqlStoreImpl(SqlStore): if not where: raise ValueError("where is required for update") - async with self.engine.begin() as conn: + async with self.async_session() as session: stmt = self.metadata.tables[table].update() for key, value in where.items(): stmt = stmt.where(self.metadata.tables[table].c[key] == value) - await conn.execute(stmt, data) - await conn.commit() + await session.execute(stmt, data) + await session.commit() async def delete(self, table: str, where: Mapping[str, Any]) -> None: if not where: raise ValueError("where is required for delete") - async with self.engine.begin() as conn: + async with 
self.async_session() as session: stmt = self.metadata.tables[table].delete() for key, value in where.items(): stmt = stmt.where(self.metadata.tables[table].c[key] == value) - await conn.execute(stmt) - await conn.commit() + await session.execute(stmt) + await session.commit() diff --git a/llama_stack/providers/utils/sqlstore/sqlstore.py b/llama_stack/providers/utils/sqlstore/sqlstore.py index 99f64805f..edc7672a3 100644 --- a/llama_stack/providers/utils/sqlstore/sqlstore.py +++ b/llama_stack/providers/utils/sqlstore/sqlstore.py @@ -5,6 +5,7 @@ # the root directory of this source tree. +from abc import abstractmethod from enum import Enum from pathlib import Path from typing import Annotated, Literal @@ -15,13 +16,26 @@ from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from .api import SqlStore +sql_store_pip_packages = ["sqlalchemy[asyncio]", "aiosqlite", "asyncpg"] + class SqlStoreType(Enum): sqlite = "sqlite" postgres = "postgres" -class SqliteSqlStoreConfig(BaseModel): +class SqlAlchemySqlStoreConfig(BaseModel): + @property + @abstractmethod + def engine_str(self) -> str: ... + + # TODO: move this when we have a better way to specify dependencies with internal APIs + @property + def pip_packages(self) -> list[str]: + return ["sqlalchemy[asyncio]"] + + +class SqliteSqlStoreConfig(SqlAlchemySqlStoreConfig): type: Literal["sqlite"] = SqlStoreType.sqlite.value db_path: str = Field( default=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), @@ -39,18 +53,37 @@ class SqliteSqlStoreConfig(BaseModel): db_path="${env.SQLITE_STORE_DIR:" + __distro_dir__ + "}/" + db_name, ) - # TODO: move this when we have a better way to specify dependencies with internal APIs @property def pip_packages(self) -> list[str]: - return ["sqlalchemy[asyncio]"] + return super().pip_packages + ["aiosqlite"] -class PostgresSqlStoreConfig(BaseModel): +class PostgresSqlStoreConfig(SqlAlchemySqlStoreConfig): type: Literal["postgres"] = SqlStoreType.postgres.value + host: str = "localhost" + port: str = "5432" + db: str = "llamastack" + user: str + password: str | None = None + + @property + def engine_str(self) -> str: + return f"postgresql+asyncpg://{self.user}:{self.password}@{self.host}:{self.port}/{self.db}" @property def pip_packages(self) -> list[str]: - raise NotImplementedError("Postgres is not implemented yet") + return super().pip_packages + ["asyncpg"] + + @classmethod + def sample_run_config(cls, **kwargs): + return cls( + type="postgres", + host="${env.POSTGRES_HOST:localhost}", + port="${env.POSTGRES_PORT:5432}", + db="${env.POSTGRES_DB:llamastack}", + user="${env.POSTGRES_USER:llamastack}", + password="${env.POSTGRES_PASSWORD:llamastack}", + ) SqlStoreConfig = Annotated[ @@ -60,12 +93,10 @@ SqlStoreConfig = Annotated[ def sqlstore_impl(config: SqlStoreConfig) -> SqlStore: - if config.type == SqlStoreType.sqlite.value: - from .sqlite.sqlite import SqliteSqlStoreImpl + if config.type in [SqlStoreType.sqlite.value, SqlStoreType.postgres.value]: + from .sqlalchemy_sqlstore import SqlAlchemySqlStoreImpl - impl = SqliteSqlStoreImpl(config) - elif config.type == SqlStoreType.postgres.value: - raise NotImplementedError("Postgres is not implemented yet") + impl = SqlAlchemySqlStoreImpl(config) else: raise ValueError(f"Unknown sqlstore type {config.type}") diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml index 09fbf307d..97a06f77a 100644 --- a/llama_stack/templates/bedrock/build.yaml +++ b/llama_stack/templates/bedrock/build.yaml @@ -30,4 +30,5 @@ 
distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index a58068a60..8033b2086 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -42,7 +42,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/trace_store.db eval: diff --git a/llama_stack/templates/cerebras/build.yaml b/llama_stack/templates/cerebras/build.yaml index 95b0302f2..f26f4ed9b 100644 --- a/llama_stack/templates/cerebras/build.yaml +++ b/llama_stack/templates/cerebras/build.yaml @@ -30,4 +30,5 @@ distribution_spec: - inline::rag-runtime image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index c080536b7..490648302 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -82,7 +82,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/trace_store.db tool_runtime: diff --git a/llama_stack/templates/ci-tests/build.yaml b/llama_stack/templates/ci-tests/build.yaml index 6fe96c603..9f4fbbdda 100644 --- a/llama_stack/templates/ci-tests/build.yaml +++ b/llama_stack/templates/ci-tests/build.yaml @@ -31,4 +31,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/ci-tests/run.yaml b/llama_stack/templates/ci-tests/run.yaml index 368187d3a..92497b0bf 100644 --- a/llama_stack/templates/ci-tests/run.yaml +++ b/llama_stack/templates/ci-tests/run.yaml @@ -45,7 +45,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/trace_store.db eval: diff --git a/llama_stack/templates/dell/build.yaml b/llama_stack/templates/dell/build.yaml index d37215f35..513df16c1 100644 --- a/llama_stack/templates/dell/build.yaml +++ b/llama_stack/templates/dell/build.yaml @@ -31,5 +31,5 @@ distribution_spec: - inline::rag-runtime image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index 5c6072245..22cf1fd24 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -48,7 +48,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/trace_store.db eval: diff --git a/llama_stack/templates/dell/run.yaml 
b/llama_stack/templates/dell/run.yaml index ffaa0bf2f..aeca2fc26 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -44,7 +44,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/trace_store.db eval: diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml index f162d9b43..53b47da41 100644 --- a/llama_stack/templates/fireworks/build.yaml +++ b/llama_stack/templates/fireworks/build.yaml @@ -24,6 +24,8 @@ distribution_spec: - inline::basic - inline::llm-as-judge - inline::braintrust + files: + - inline::localfs tool_runtime: - remote::brave-search - remote::tavily-search @@ -32,5 +34,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py index da68475e2..5e8935361 100644 --- a/llama_stack/templates/fireworks/fireworks.py +++ b/llama_stack/templates/fireworks/fireworks.py @@ -13,6 +13,7 @@ from llama_stack.distribution.datatypes import ( ShieldInput, ToolGroupInput, ) +from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) @@ -36,6 +37,7 @@ def get_distribution_template() -> DistributionTemplate: "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "files": ["inline::localfs"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", @@ -62,6 +64,11 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::faiss", config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), ) + files_provider = Provider( + provider_id="meta-reference-files", + provider_type="inline::localfs", + config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ) available_models = { "fireworks": MODEL_ENTRIES, @@ -104,6 +111,7 @@ def get_distribution_template() -> DistributionTemplate: provider_overrides={ "inference": [inference_provider, embedding_provider], "vector_io": [vector_io_provider], + "files": [files_provider], }, default_models=default_models + [embedding_model], default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], @@ -116,6 +124,7 @@ def get_distribution_template() -> DistributionTemplate: embedding_provider, ], "vector_io": [vector_io_provider], + "files": [files_provider], "safety": [ Provider( provider_id="llama-guard", diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 41500f6f6..302328486 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -4,6 +4,7 @@ apis: - agents - datasetio - eval +- files - inference - safety - scoring @@ -53,7 +54,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} 
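Taken together, the kvstore.py and sqlstore.py hunks above let a distribution install only the driver its configured store actually needs, while kvstore_dependencies() keeps returning the union for provider registries that cannot know the store type up front. A minimal sketch of the difference, assuming llama_stack is installed with the modules laid out as in this diff:

from llama_stack.providers.utils.kvstore.kvstore import kvstore_dependencies
from llama_stack.providers.utils.sqlstore.sqlstore import (
    PostgresSqlStoreConfig,
    SqliteSqlStoreConfig,
)

# Per-backend dependencies now come from the config object itself...
sqlite_store = SqliteSqlStoreConfig(db_path="/tmp/llamastack-demo/sqlstore.db")
postgres_store = PostgresSqlStoreConfig(user="llamastack", password="llamastack")

print(sqlite_store.pip_packages)    # ['sqlalchemy[asyncio]', 'aiosqlite']
print(postgres_store.pip_packages)  # ['sqlalchemy[asyncio]', 'asyncpg']

# ...while the registry-level helper still returns the union of every kvstore driver.
print(kvstore_dependencies())       # ['aiosqlite', 'psycopg2-binary', 'redis', 'pymongo']

The aiosqlite entries added to the additional_pip_packages lists in the build.yaml hunks reflect the same split.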
sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/trace_store.db eval: @@ -90,6 +91,14 @@ providers: provider_type: inline::braintrust config: openai_api_key: ${env.OPENAI_API_KEY:} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/fireworks/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/files_metadata.db tool_runtime: - provider_id: brave-search provider_type: remote::brave-search diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index b1fa03306..a31ed732b 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -4,6 +4,7 @@ apis: - agents - datasetio - eval +- files - inference - safety - scoring @@ -48,7 +49,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/trace_store.db eval: @@ -85,6 +86,14 @@ providers: provider_type: inline::braintrust config: openai_api_key: ${env.OPENAI_API_KEY:} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/fireworks/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/files_metadata.db tool_runtime: - provider_id: brave-search provider_type: remote::brave-search diff --git a/llama_stack/templates/groq/build.yaml b/llama_stack/templates/groq/build.yaml index 92b46ce66..819df22f0 100644 --- a/llama_stack/templates/groq/build.yaml +++ b/llama_stack/templates/groq/build.yaml @@ -27,4 +27,5 @@ distribution_spec: - inline::rag-runtime image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml index db7ebffee..7f1912a6f 100644 --- a/llama_stack/templates/groq/run.yaml +++ b/llama_stack/templates/groq/run.yaml @@ -48,7 +48,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/trace_store.db eval: @@ -112,7 +112,7 @@ models: provider_model_id: groq/llama3-8b-8192 model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct + model_id: groq/meta-llama/Llama-3.1-8B-Instruct provider_id: groq provider_model_id: groq/llama3-8b-8192 model_type: llm @@ -127,7 +127,7 @@ models: provider_model_id: groq/llama3-70b-8192 model_type: llm - metadata: {} - model_id: meta-llama/Llama-3-70B-Instruct + model_id: groq/meta-llama/Llama-3-70B-Instruct provider_id: groq provider_model_id: groq/llama3-70b-8192 model_type: llm @@ -137,7 +137,7 @@ models: provider_model_id: groq/llama-3.3-70b-versatile model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct + model_id: groq/meta-llama/Llama-3.3-70B-Instruct provider_id: groq provider_model_id: groq/llama-3.3-70b-versatile model_type: llm @@ -147,7 +147,7 @@ models: provider_model_id: groq/llama-3.2-3b-preview model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct + 
model_id: groq/meta-llama/Llama-3.2-3B-Instruct provider_id: groq provider_model_id: groq/llama-3.2-3b-preview model_type: llm @@ -157,7 +157,7 @@ models: provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct provider_id: groq provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm @@ -167,7 +167,7 @@ models: provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct provider_id: groq provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct model_type: llm @@ -177,7 +177,7 @@ models: provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct provider_id: groq provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm @@ -187,7 +187,7 @@ models: provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct provider_id: groq provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct model_type: llm diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml index 4d09cc33e..8ede83694 100644 --- a/llama_stack/templates/hf-endpoint/build.yaml +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -30,5 +30,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 15cf2a47f..8b00f4ba5 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -53,7 +53,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/trace_store.db eval: diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 428edf9a2..8a9cd5c49 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -48,7 +48,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/trace_store.db eval: diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml index d06c628ac..d0752db9a 100644 --- a/llama_stack/templates/hf-serverless/build.yaml +++ b/llama_stack/templates/hf-serverless/build.yaml @@ -31,5 +31,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git 
a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index ab461c6c3..fec64c1df 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -53,7 +53,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/trace_store.db eval: diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index d238506fb..d4a6286d7 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -48,7 +48,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/trace_store.db eval: diff --git a/llama_stack/templates/llama_api/build.yaml b/llama_stack/templates/llama_api/build.yaml index d0dc08923..857e5f014 100644 --- a/llama_stack/templates/llama_api/build.yaml +++ b/llama_stack/templates/llama_api/build.yaml @@ -31,4 +31,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/llama_api/run.yaml b/llama_stack/templates/llama_api/run.yaml index a7f2b0769..2185eb4fc 100644 --- a/llama_stack/templates/llama_api/run.yaml +++ b/llama_stack/templates/llama_api/run.yaml @@ -57,7 +57,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/trace_store.db eval: diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml index e0ac87e47..53ad411e3 100644 --- a/llama_stack/templates/meta-reference-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-gpu/build.yaml @@ -30,5 +30,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 2b751a514..e65445a9e 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -63,7 +63,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/trace_store.db eval: diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index a24c5fec5..8ef02f14d 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -53,7 +53,7 @@ providers: - provider_id: meta-reference 
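For context on the sqlalchemy_sqlstore.py rename earlier in this diff, which trades a single long-lived engine connection for a session per operation, here is a standalone sketch of that async_sessionmaker pattern in plain SQLAlchemy 2.x with the aiosqlite driver; the table and values are made up for illustration and nothing here uses llama_stack itself:

import asyncio

from sqlalchemy import Column, Integer, MetaData, String, Table, insert, select
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.pool import StaticPool

metadata = MetaData()
items = Table("items", metadata, Column("id", Integer, primary_key=True), Column("name", String))


async def main() -> None:
    # StaticPool keeps the single in-memory SQLite database alive across sessions.
    engine = create_async_engine("sqlite+aiosqlite:///:memory:", poolclass=StaticPool)
    async with engine.begin() as conn:
        # DDL still runs against the engine, mirroring create_table() in the impl above.
        await conn.run_sync(metadata.create_all)

    session_factory = async_sessionmaker(engine)

    async with session_factory() as session:
        # Writes open their own session and commit it, as insert()/update()/delete() now do.
        await session.execute(insert(items), {"id": 1, "name": "demo"})
        await session.commit()

    async with session_factory() as session:
        # Reads go through a fresh session as well, as fetch_all() now does.
        rows = (await session.execute(select(items))).all()
        print(rows)  # [(1, 'demo')]


asyncio.run(main())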
provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/trace_store.db eval: diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml index e1e6fb3d8..6bd8a0100 100644 --- a/llama_stack/templates/nvidia/build.yaml +++ b/llama_stack/templates/nvidia/build.yaml @@ -25,5 +25,5 @@ distribution_spec: - inline::rag-runtime image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/nvidia/run-with-safety.yaml b/llama_stack/templates/nvidia/run-with-safety.yaml index eeccc006a..eebfa1066 100644 --- a/llama_stack/templates/nvidia/run-with-safety.yaml +++ b/llama_stack/templates/nvidia/run-with-safety.yaml @@ -53,7 +53,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/trace_store.db eval: diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index cd36ec362..be0e3f6d1 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -48,7 +48,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/trace_store.db eval: diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index 9d8ba3a1e..36a120897 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -33,5 +33,5 @@ distribution_spec: - remote::wolfram-alpha image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index d63c5e366..7bf9fc3bd 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -47,7 +47,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/trace_store.db eval: diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index d208cd7f0..0030bcd60 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -45,7 +45,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/trace_store.db eval: diff --git a/llama_stack/templates/open-benchmark/build.yaml b/llama_stack/templates/open-benchmark/build.yaml index aa6d876fe..840f1e1db 100644 --- a/llama_stack/templates/open-benchmark/build.yaml +++ 
b/llama_stack/templates/open-benchmark/build.yaml @@ -34,4 +34,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 0e5edf728..051ca6f8e 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -71,7 +71,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/trace_store.db eval: diff --git a/llama_stack/templates/passthrough/build.yaml b/llama_stack/templates/passthrough/build.yaml index 7560f1032..46b99cb75 100644 --- a/llama_stack/templates/passthrough/build.yaml +++ b/llama_stack/templates/passthrough/build.yaml @@ -32,5 +32,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/passthrough/run-with-safety.yaml b/llama_stack/templates/passthrough/run-with-safety.yaml index bbf5d9a52..3168eeb9f 100644 --- a/llama_stack/templates/passthrough/run-with-safety.yaml +++ b/llama_stack/templates/passthrough/run-with-safety.yaml @@ -53,7 +53,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/trace_store.db eval: diff --git a/llama_stack/templates/passthrough/run.yaml b/llama_stack/templates/passthrough/run.yaml index 146906d9b..48abf8577 100644 --- a/llama_stack/templates/passthrough/run.yaml +++ b/llama_stack/templates/passthrough/run.yaml @@ -48,7 +48,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/trace_store.db eval: diff --git a/llama_stack/templates/verification/__init__.py b/llama_stack/templates/postgres-demo/__init__.py similarity index 75% rename from llama_stack/templates/verification/__init__.py rename to llama_stack/templates/postgres-demo/__init__.py index 5d8c281a6..81473cb73 100644 --- a/llama_stack/templates/verification/__init__.py +++ b/llama_stack/templates/postgres-demo/__init__.py @@ -4,4 +4,4 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
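As context for the postgres-demo template that follows, a short sketch of what the new PostgresSqlStoreConfig resolves to, assuming asyncpg is installed and using placeholder connection values:

from llama_stack.providers.utils.sqlstore.sqlstore import (
    PostgresSqlStoreConfig,
    sqlstore_impl,
)

config = PostgresSqlStoreConfig(
    host="db.internal",           # placeholder value
    port="5432",
    db="llamastack",
    user="llamastack",
    password="example-password",  # placeholder value
)

# The SQLAlchemy URL the new engine_str property builds for the asyncpg driver.
print(config.engine_str)
# postgresql+asyncpg://llamastack:example-password@db.internal:5432/llamastack

# sqlstore_impl() now routes both sqlite and postgres configs to SqlAlchemySqlStoreImpl;
# constructing it only creates the lazy async engine, so no connection is opened here.
store = sqlstore_impl(config)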
-from .verification import get_distribution_template # noqa: F401 +from .postgres_demo import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/postgres-demo/build.yaml b/llama_stack/templates/postgres-demo/build.yaml new file mode 100644 index 000000000..6416cd00f --- /dev/null +++ b/llama_stack/templates/postgres-demo/build.yaml @@ -0,0 +1,25 @@ +version: '2' +distribution_spec: + description: Quick start template for running Llama Stack with several popular providers + providers: + inference: + - remote::vllm + - inline::sentence-transformers + vector_io: + - remote::chromadb + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference + tool_runtime: + - remote::brave-search + - remote::tavily-search + - inline::rag-runtime + - remote::model-context-protocol +image_type: conda +additional_pip_packages: +- asyncpg +- psycopg2-binary +- sqlalchemy[asyncio] diff --git a/llama_stack/templates/postgres-demo/postgres_demo.py b/llama_stack/templates/postgres-demo/postgres_demo.py new file mode 100644 index 000000000..759281567 --- /dev/null +++ b/llama_stack/templates/postgres-demo/postgres_demo.py @@ -0,0 +1,137 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +from llama_stack.apis.models.models import ModelType +from llama_stack.distribution.datatypes import ( + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) +from llama_stack.providers.inline.inference.sentence_transformers import SentenceTransformersInferenceConfig +from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig +from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig +from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig +from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, +) + + +def get_distribution_template() -> DistributionTemplate: + inference_providers = [ + Provider( + provider_id="vllm-inference", + provider_type="remote::vllm", + config=VLLMInferenceAdapterConfig.sample_run_config( + url="${env.VLLM_URL:http://localhost:8000/v1}", + ), + ), + ] + providers = { + "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]), + "vector_io": ["remote::chromadb"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + "inline::rag-runtime", + "remote::model-context-protocol", + ], + } + name = "postgres-demo" + + vector_io_providers = [ + Provider( + provider_id="${env.ENABLE_CHROMADB+chromadb}", + provider_type="remote::chromadb", + config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"), + ), + ] + default_tool_groups = [ + ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ] + + default_models = [ + ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="vllm-inference", + ) + ] + embedding_provider = Provider( + provider_id="sentence-transformers", + provider_type="inline::sentence-transformers", + 
config=SentenceTransformersInferenceConfig.sample_run_config(), + ) + embedding_model = ModelInput( + model_id="all-MiniLM-L6-v2", + provider_id=embedding_provider.provider_id, + model_type=ModelType.embedding, + metadata={ + "embedding_dimension": 384, + }, + ) + postgres_config = PostgresSqlStoreConfig.sample_run_config() + return DistributionTemplate( + name=name, + distro_type="self_hosted", + description="Quick start template for running Llama Stack with several popular providers", + container_image=None, + template_path=None, + providers=providers, + available_models_by_provider={}, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": inference_providers + [embedding_provider], + "vector_io": vector_io_providers, + "agents": [ + Provider( + provider_id="meta-reference", + provider_type="inline::meta-reference", + config=dict( + persistence_store=postgres_config, + responses_store=postgres_config, + ), + ) + ], + "telemetry": [ + Provider( + provider_id="meta-reference", + provider_type="inline::meta-reference", + config=dict( + service_name="${env.OTEL_SERVICE_NAME:}", + sinks="${env.TELEMETRY_SINKS:console,otel_trace}", + otel_trace_endpoint="${env.OTEL_TRACE_ENDPOINT:http://localhost:4318/v1/traces}", + ), + ) + ], + }, + default_models=default_models + [embedding_model], + default_tool_groups=default_tool_groups, + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + metadata_store=PostgresKVStoreConfig.sample_run_config(), + inference_store=postgres_config, + ), + }, + run_config_env_vars={ + "LLAMA_STACK_PORT": ( + "8321", + "Port for the Llama Stack distribution server", + ), + }, + ) diff --git a/llama_stack/templates/postgres-demo/run.yaml b/llama_stack/templates/postgres-demo/run.yaml new file mode 100644 index 000000000..0e0d020b2 --- /dev/null +++ b/llama_stack/templates/postgres-demo/run.yaml @@ -0,0 +1,111 @@ +version: '2' +image_name: postgres-demo +apis: +- agents +- inference +- safety +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: vllm-inference + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:http://localhost:8000/v1} + max_tokens: ${env.VLLM_MAX_TOKENS:4096} + api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: ${env.ENABLE_CHROMADB+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:} + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + responses_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:} + sinks: ${env.TELEMETRY_SINKS:console,otel_trace} + otel_trace_endpoint: ${env.OTEL_TRACE_ENDPOINT:http://localhost:4318/v1/traces} + tool_runtime: + - provider_id: brave-search + provider_type: 
remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} + table_name: ${env.POSTGRES_TABLE_NAME:llamastack_kvstore} +inference_store: + type: postgres + host: ${env.POSTGRES_HOST:localhost} + port: ${env.POSTGRES_PORT:5432} + db: ${env.POSTGRES_DB:llamastack} + user: ${env.POSTGRES_USER:llamastack} + password: ${env.POSTGRES_PASSWORD:llamastack} +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: vllm-inference + model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding +shields: +- shield_id: meta-llama/Llama-Guard-3-8B +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml index fcd4deeff..16fe5d4fd 100644 --- a/llama_stack/templates/remote-vllm/build.yaml +++ b/llama_stack/templates/remote-vllm/build.yaml @@ -32,5 +32,5 @@ distribution_spec: - remote::wolfram-alpha image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index e83162a4f..64f71087a 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -91,7 +91,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/trace_store.db tool_runtime: diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 4cdf88c6b..353b9902d 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -84,7 +84,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/trace_store.db tool_runtime: diff --git a/llama_stack/templates/sambanova/build.yaml b/llama_stack/templates/sambanova/build.yaml index b644dcfdc..14b1c8974 100644 --- a/llama_stack/templates/sambanova/build.yaml +++ b/llama_stack/templates/sambanova/build.yaml @@ -23,4 +23,5 @@ distribution_spec: - remote::wolfram-alpha image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 8c2a933ab..58d0d36e3 100644 
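The postgres-demo run.yaml above leans heavily on ${env.NAME:default} placeholders (POSTGRES_HOST, CHROMADB_URL, OTEL_SERVICE_NAME, and so on). Purely as an illustration, and not llama_stack's actual resolver, a hypothetical helper with the same observable behavior: the environment variable wins when it is set, otherwise the text after the colon is used.

import os
import re

_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z0-9_]+):([^}]*)\}")


def resolve_env_placeholders(text: str) -> str:
    """Replace ${env.NAME:default} with the env var if set, else the default."""
    return _PLACEHOLDER.sub(lambda m: os.environ.get(m.group(1), m.group(2)), text)


os.environ.pop("POSTGRES_HOST", None)
print(resolve_env_placeholders("host: ${env.POSTGRES_HOST:localhost}"))
# host: localhost

os.environ["POSTGRES_HOST"] = "db.example.internal"
print(resolve_env_placeholders("host: ${env.POSTGRES_HOST:localhost}"))
# host: db.example.internal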
--- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -58,7 +58,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/trace_store.db tool_runtime: diff --git a/llama_stack/templates/starter/build.yaml b/llama_stack/templates/starter/build.yaml index 652814ffd..5fd3cc3f5 100644 --- a/llama_stack/templates/starter/build.yaml +++ b/llama_stack/templates/starter/build.yaml @@ -5,10 +5,13 @@ distribution_spec: inference: - remote::openai - remote::fireworks + - remote::together + - remote::ollama - remote::anthropic - remote::gemini - remote::groq - remote::sambanova + - remote::vllm - inline::sentence-transformers vector_io: - inline::sqlite-vec @@ -36,4 +39,6 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: +- aiosqlite +- asyncpg - sqlalchemy[asyncio] diff --git a/llama_stack/templates/starter/run.yaml b/llama_stack/templates/starter/run.yaml index 04425ed35..4732afa77 100644 --- a/llama_stack/templates/starter/run.yaml +++ b/llama_stack/templates/starter/run.yaml @@ -21,6 +21,15 @@ providers: config: url: https://api.fireworks.ai/inference/v1 api_key: ${env.FIREWORKS_API_KEY:} + - provider_id: together + provider_type: remote::together + config: + url: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY:} + - provider_id: ollama + provider_type: remote::ollama + config: + url: ${env.OLLAMA_URL:http://localhost:11434} - provider_id: anthropic provider_type: remote::anthropic config: @@ -39,6 +48,13 @@ providers: config: url: https://api.sambanova.ai/v1 api_key: ${env.SAMBANOVA_API_KEY:} + - provider_id: vllm + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:http://localhost:8000/v1} + max_tokens: ${env.VLLM_MAX_TOKENS:4096} + api_token: ${env.VLLM_API_TOKEN:fake} + tls_verify: ${env.VLLM_TLS_VERIFY:true} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} @@ -79,7 +95,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/trace_store.db eval: @@ -156,72 +172,72 @@ models: provider_model_id: openai/chatgpt-4o-latest model_type: llm - metadata: {} - model_id: gpt-3.5-turbo-0125 + model_id: openai/gpt-3.5-turbo-0125 provider_id: openai provider_model_id: gpt-3.5-turbo-0125 model_type: llm - metadata: {} - model_id: gpt-3.5-turbo + model_id: openai/gpt-3.5-turbo provider_id: openai provider_model_id: gpt-3.5-turbo model_type: llm - metadata: {} - model_id: gpt-3.5-turbo-instruct + model_id: openai/gpt-3.5-turbo-instruct provider_id: openai provider_model_id: gpt-3.5-turbo-instruct model_type: llm - metadata: {} - model_id: gpt-4 + model_id: openai/gpt-4 provider_id: openai provider_model_id: gpt-4 model_type: llm - metadata: {} - model_id: gpt-4-turbo + model_id: openai/gpt-4-turbo provider_id: openai provider_model_id: gpt-4-turbo model_type: llm - metadata: {} - model_id: gpt-4o + model_id: openai/gpt-4o provider_id: openai provider_model_id: gpt-4o model_type: llm - metadata: {} - model_id: gpt-4o-2024-08-06 + model_id: openai/gpt-4o-2024-08-06 provider_id: 
openai provider_model_id: gpt-4o-2024-08-06 model_type: llm - metadata: {} - model_id: gpt-4o-mini + model_id: openai/gpt-4o-mini provider_id: openai provider_model_id: gpt-4o-mini model_type: llm - metadata: {} - model_id: gpt-4o-audio-preview + model_id: openai/gpt-4o-audio-preview provider_id: openai provider_model_id: gpt-4o-audio-preview model_type: llm - metadata: {} - model_id: chatgpt-4o-latest + model_id: openai/chatgpt-4o-latest provider_id: openai provider_model_id: chatgpt-4o-latest model_type: llm - metadata: {} - model_id: o1 + model_id: openai/o1 provider_id: openai provider_model_id: o1 model_type: llm - metadata: {} - model_id: o1-mini + model_id: openai/o1-mini provider_id: openai provider_model_id: o1-mini model_type: llm - metadata: {} - model_id: o3-mini + model_id: openai/o3-mini provider_id: openai provider_model_id: o3-mini model_type: llm - metadata: {} - model_id: o4-mini + model_id: openai/o4-mini provider_id: openai provider_model_id: o4-mini model_type: llm @@ -242,14 +258,14 @@ models: - metadata: embedding_dimension: 1536 context_length: 8192 - model_id: text-embedding-3-small + model_id: openai/text-embedding-3-small provider_id: openai provider_model_id: text-embedding-3-small model_type: embedding - metadata: embedding_dimension: 3072 context_length: 8192 - model_id: text-embedding-3-large + model_id: openai/text-embedding-3-large provider_id: openai provider_model_id: text-embedding-3-large model_type: embedding @@ -259,7 +275,7 @@ models: provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct + model_id: fireworks/meta-llama/Llama-3.1-8B-Instruct provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct model_type: llm @@ -269,7 +285,7 @@ models: provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct + model_id: fireworks/meta-llama/Llama-3.1-70B-Instruct provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct model_type: llm @@ -279,7 +295,7 @@ models: provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 + model_id: fireworks/meta-llama/Llama-3.1-405B-Instruct-FP8 provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm @@ -289,7 +305,7 @@ models: provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct + model_id: fireworks/meta-llama/Llama-3.2-3B-Instruct provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct model_type: llm @@ -299,7 +315,7 @@ models: provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + model_id: fireworks/meta-llama/Llama-3.2-11B-Vision-Instruct provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct model_type: llm @@ -309,7 +325,7 @@ models: provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct + model_id: fireworks/meta-llama/Llama-3.2-90B-Vision-Instruct provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct model_type: 
llm @@ -319,7 +335,7 @@ models: provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct + model_id: fireworks/meta-llama/Llama-3.3-70B-Instruct provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct model_type: llm @@ -329,7 +345,7 @@ models: provider_model_id: accounts/fireworks/models/llama-guard-3-8b model_type: llm - metadata: {} - model_id: meta-llama/Llama-Guard-3-8B + model_id: fireworks/meta-llama/Llama-Guard-3-8B provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-guard-3-8b model_type: llm @@ -339,7 +355,7 @@ models: provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision model_type: llm - metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision + model_id: fireworks/meta-llama/Llama-Guard-3-11B-Vision provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision model_type: llm @@ -349,7 +365,7 @@ models: provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_id: fireworks/meta-llama/Llama-4-Scout-17B-16E-Instruct provider_id: fireworks provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic model_type: llm @@ -359,17 +375,307 @@ models: provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + model_id: fireworks/meta-llama/Llama-4-Maverick-17B-128E-Instruct provider_id: fireworks provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic model_type: llm - metadata: embedding_dimension: 768 context_length: 8192 - model_id: nomic-ai/nomic-embed-text-v1.5 + model_id: fireworks/nomic-ai/nomic-embed-text-v1.5 provider_id: fireworks provider_model_id: nomic-ai/nomic-embed-text-v1.5 model_type: embedding +- metadata: {} + model_id: together/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.1-8B-Instruct + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.1-70B-Instruct + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.2-3B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.2-3B-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + provider_id: 
together + provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.3-70B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-3.3-70B-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Meta-Llama-Guard-3-8B + provider_id: together + provider_model_id: meta-llama/Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-Guard-3-8B + provider_id: together + provider_model_id: meta-llama/Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-Guard-3-11B-Vision-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-Guard-3-11B-Vision + provider_id: together + provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + model_type: llm +- metadata: + embedding_dimension: 768 + context_length: 8192 + model_id: togethercomputer/m2-bert-80M-8k-retrieval + provider_id: together + provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval + model_type: embedding +- metadata: + embedding_dimension: 768 + context_length: 32768 + model_id: togethercomputer/m2-bert-80M-32k-retrieval + provider_id: together + provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval + model_type: embedding +- metadata: {} + model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: ollama/llama3.1:8b-instruct-fp16 + provider_id: ollama + provider_model_id: llama3.1:8b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-3.1-8B-Instruct + provider_id: ollama + 
provider_model_id: llama3.1:8b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/llama3.1:8b + provider_id: ollama + provider_model_id: llama3.1:8b + model_type: llm +- metadata: {} + model_id: ollama/llama3.1:70b-instruct-fp16 + provider_id: ollama + provider_model_id: llama3.1:70b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-3.1-70B-Instruct + provider_id: ollama + provider_model_id: llama3.1:70b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/llama3.1:70b + provider_id: ollama + provider_model_id: llama3.1:70b + model_type: llm +- metadata: {} + model_id: ollama/llama3.1:405b-instruct-fp16 + provider_id: ollama + provider_model_id: llama3.1:405b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ollama + provider_model_id: llama3.1:405b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/llama3.1:405b + provider_id: ollama + provider_model_id: llama3.1:405b + model_type: llm +- metadata: {} + model_id: ollama/llama3.2:1b-instruct-fp16 + provider_id: ollama + provider_model_id: llama3.2:1b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-3.2-1B-Instruct + provider_id: ollama + provider_model_id: llama3.2:1b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/llama3.2:1b + provider_id: ollama + provider_model_id: llama3.2:1b + model_type: llm +- metadata: {} + model_id: ollama/llama3.2:3b-instruct-fp16 + provider_id: ollama + provider_model_id: llama3.2:3b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-3.2-3B-Instruct + provider_id: ollama + provider_model_id: llama3.2:3b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/llama3.2:3b + provider_id: ollama + provider_model_id: llama3.2:3b + model_type: llm +- metadata: {} + model_id: ollama/llama3.2-vision:11b-instruct-fp16 + provider_id: ollama + provider_model_id: llama3.2-vision:11b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: ollama + provider_model_id: llama3.2-vision:11b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/llama3.2-vision:latest + provider_id: ollama + provider_model_id: llama3.2-vision:latest + model_type: llm +- metadata: {} + model_id: ollama/llama3.2-vision:90b-instruct-fp16 + provider_id: ollama + provider_model_id: llama3.2-vision:90b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: ollama + provider_model_id: llama3.2-vision:90b-instruct-fp16 + model_type: llm +- metadata: {} + model_id: ollama/llama3.2-vision:90b + provider_id: ollama + provider_model_id: llama3.2-vision:90b + model_type: llm +- metadata: {} + model_id: ollama/llama3.3:70b + provider_id: ollama + provider_model_id: llama3.3:70b + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-3.3-70B-Instruct + provider_id: ollama + provider_model_id: llama3.3:70b + model_type: llm +- metadata: {} + model_id: ollama/llama-guard3:8b + provider_id: ollama + provider_model_id: llama-guard3:8b + model_type: llm +- metadata: {} + model_id: ollama/meta-llama/Llama-Guard-3-8B + provider_id: ollama + provider_model_id: llama-guard3:8b + model_type: llm +- metadata: {} + model_id: ollama/llama-guard3:1b + provider_id: ollama + provider_model_id: llama-guard3:1b + model_type: llm +- metadata: {} + model_id: 
ollama/meta-llama/Llama-Guard-3-1B + provider_id: ollama + provider_model_id: llama-guard3:1b + model_type: llm +- metadata: + embedding_dimension: 384 + context_length: 512 + model_id: ollama/all-minilm:latest + provider_id: ollama + provider_model_id: all-minilm:latest + model_type: embedding +- metadata: + embedding_dimension: 384 + context_length: 512 + model_id: ollama/all-minilm + provider_id: ollama + provider_model_id: all-minilm:latest + model_type: embedding +- metadata: + embedding_dimension: 768 + context_length: 8192 + model_id: ollama/nomic-embed-text + provider_id: ollama + provider_model_id: nomic-embed-text + model_type: embedding - metadata: {} model_id: anthropic/claude-3-5-sonnet-latest provider_id: anthropic @@ -429,7 +735,7 @@ models: provider_model_id: groq/llama3-8b-8192 model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct + model_id: groq/meta-llama/Llama-3.1-8B-Instruct provider_id: groq provider_model_id: groq/llama3-8b-8192 model_type: llm @@ -444,7 +750,7 @@ models: provider_model_id: groq/llama3-70b-8192 model_type: llm - metadata: {} - model_id: meta-llama/Llama-3-70B-Instruct + model_id: groq/meta-llama/Llama-3-70B-Instruct provider_id: groq provider_model_id: groq/llama3-70b-8192 model_type: llm @@ -454,7 +760,7 @@ models: provider_model_id: groq/llama-3.3-70b-versatile model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct + model_id: groq/meta-llama/Llama-3.3-70B-Instruct provider_id: groq provider_model_id: groq/llama-3.3-70b-versatile model_type: llm @@ -464,7 +770,7 @@ models: provider_model_id: groq/llama-3.2-3b-preview model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct + model_id: groq/meta-llama/Llama-3.2-3B-Instruct provider_id: groq provider_model_id: groq/llama-3.2-3b-preview model_type: llm @@ -474,7 +780,7 @@ models: provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct provider_id: groq provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm @@ -484,7 +790,7 @@ models: provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct provider_id: groq provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct model_type: llm @@ -494,7 +800,7 @@ models: provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct provider_id: groq provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm @@ -504,7 +810,7 @@ models: provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct provider_id: groq provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct model_type: llm @@ -514,7 +820,7 @@ models: provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct + model_id: sambanova/meta-llama/Llama-3.1-8B-Instruct provider_id: sambanova provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct model_type: llm @@ -524,7 +830,7 @@ models: provider_model_id: 
sambanova/Meta-Llama-3.1-405B-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 + model_id: sambanova/meta-llama/Llama-3.1-405B-Instruct-FP8 provider_id: sambanova provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct model_type: llm @@ -534,7 +840,7 @@ models: provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct + model_id: sambanova/meta-llama/Llama-3.2-1B-Instruct provider_id: sambanova provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct model_type: llm @@ -544,7 +850,7 @@ models: provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct + model_id: sambanova/meta-llama/Llama-3.2-3B-Instruct provider_id: sambanova provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct model_type: llm @@ -554,7 +860,7 @@ models: provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct + model_id: sambanova/meta-llama/Llama-3.3-70B-Instruct provider_id: sambanova provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct model_type: llm @@ -564,7 +870,7 @@ models: provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + model_id: sambanova/meta-llama/Llama-3.2-11B-Vision-Instruct provider_id: sambanova provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct model_type: llm @@ -574,7 +880,7 @@ models: provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct + model_id: sambanova/meta-llama/Llama-3.2-90B-Vision-Instruct provider_id: sambanova provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct model_type: llm @@ -584,7 +890,7 @@ models: provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_id: sambanova/meta-llama/Llama-4-Scout-17B-16E-Instruct provider_id: sambanova provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct model_type: llm @@ -594,7 +900,7 @@ models: provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct model_type: llm - metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + model_id: sambanova/meta-llama/Llama-4-Maverick-17B-128E-Instruct provider_id: sambanova provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct model_type: llm @@ -604,7 +910,7 @@ models: provider_model_id: sambanova/Meta-Llama-Guard-3-8B model_type: llm - metadata: {} - model_id: meta-llama/Llama-Guard-3-8B + model_id: sambanova/meta-llama/Llama-Guard-3-8B provider_id: sambanova provider_model_id: sambanova/Meta-Llama-Guard-3-8B model_type: llm diff --git a/llama_stack/templates/starter/starter.py b/llama_stack/templates/starter/starter.py index 0932bfdfe..650ecc87f 100644 --- a/llama_stack/templates/starter/starter.py +++ b/llama_stack/templates/starter/starter.py @@ -34,6 +34,10 @@ from llama_stack.providers.remote.inference.groq.config import GroqConfig from llama_stack.providers.remote.inference.groq.models import ( MODEL_ENTRIES as GROQ_MODEL_ENTRIES, ) +from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig +from llama_stack.providers.remote.inference.ollama.models import ( + MODEL_ENTRIES as OLLAMA_MODEL_ENTRIES, +) from llama_stack.providers.remote.inference.openai.config import OpenAIConfig from 
llama_stack.providers.remote.inference.openai.models import ( MODEL_ENTRIES as OPENAI_MODEL_ENTRIES, @@ -42,11 +46,17 @@ from llama_stack.providers.remote.inference.sambanova.config import SambaNovaImp from llama_stack.providers.remote.inference.sambanova.models import ( MODEL_ENTRIES as SAMBANOVA_MODEL_ENTRIES, ) +from llama_stack.providers.remote.inference.together.config import TogetherImplConfig +from llama_stack.providers.remote.inference.together.models import ( + MODEL_ENTRIES as TOGETHER_MODEL_ENTRIES, +) +from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig from llama_stack.providers.remote.vector_io.pgvector.config import ( PGVectorVectorIOConfig, ) from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry +from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig from llama_stack.templates.template import ( DistributionTemplate, RunConfigSettings, @@ -67,6 +77,16 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo FIREWORKS_MODEL_ENTRIES, FireworksImplConfig.sample_run_config(api_key="${env.FIREWORKS_API_KEY:}"), ), + ( + "together", + TOGETHER_MODEL_ENTRIES, + TogetherImplConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:}"), + ), + ( + "ollama", + OLLAMA_MODEL_ENTRIES, + OllamaImplConfig.sample_run_config(), + ), ( "anthropic", ANTHROPIC_MODEL_ENTRIES, @@ -87,6 +107,13 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo SAMBANOVA_MODEL_ENTRIES, SambaNovaImplConfig.sample_run_config(api_key="${env.SAMBANOVA_API_KEY:}"), ), + ( + "vllm", + [], + VLLMInferenceAdapterConfig.sample_run_config( + url="${env.VLLM_URL:http://localhost:8000/v1}", + ), + ), ] inference_providers = [] available_models = {} @@ -169,6 +196,8 @@ def get_distribution_template() -> DistributionTemplate: ) default_models = get_model_registry(available_models) + + postgres_store = PostgresSqlStoreConfig.sample_run_config() return DistributionTemplate( name=name, distro_type="self_hosted", @@ -177,6 +206,7 @@ def get_distribution_template() -> DistributionTemplate: template_path=None, providers=providers, available_models_by_provider=available_models, + additional_pip_packages=postgres_store.pip_packages, run_configs={ "run.yaml": RunConfigSettings( provider_overrides={ @@ -201,5 +231,25 @@ def get_distribution_template() -> DistributionTemplate: "", "OpenAI API Key", ), + "GROQ_API_KEY": ( + "", + "Groq API Key", + ), + "ANTHROPIC_API_KEY": ( + "", + "Anthropic API Key", + ), + "GEMINI_API_KEY": ( + "", + "Gemini API Key", + ), + "SAMBANOVA_API_KEY": ( + "", + "SambaNova API Key", + ), + "VLLM_URL": ( + "http://localhost:8000/v1", + "VLLM URL", + ), }, ) diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index ec5cd38ea..712d2dcb4 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -8,6 +8,7 @@ from pathlib import Path from typing import Literal import jinja2 +import rich import yaml from pydantic import BaseModel, Field @@ -28,21 +29,43 @@ from llama_stack.distribution.datatypes import ( from llama_stack.distribution.distribution import get_provider_registry from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry -from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig -from 
llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
+from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig
 
 
 def get_model_registry(
     available_models: dict[str, list[ProviderModelEntry]],
 ) -> list[ModelInput]:
     models = []
+
+    # check for conflicts in model ids
+    all_ids = set()
+    ids_conflict = False
+
+    for _, entries in available_models.items():
+        for entry in entries:
+            ids = [entry.provider_model_id] + entry.aliases
+            for model_id in ids:
+                if model_id in all_ids:
+                    ids_conflict = True
+                    rich.print(
+                        f"[yellow]Model id {model_id} conflicts; all model ids will be prefixed with provider id[/yellow]"
+                    )
+                    break
+            all_ids.update(ids)
+            if ids_conflict:
+                break
+        if ids_conflict:
+            break
+
     for provider_id, entries in available_models.items():
         for entry in entries:
             ids = [entry.provider_model_id] + entry.aliases
             for model_id in ids:
+                identifier = f"{provider_id}/{model_id}" if ids_conflict and provider_id not in model_id else model_id
                 models.append(
                     ModelInput(
-                        model_id=model_id,
+                        model_id=identifier,
                         provider_model_id=entry.provider_model_id,
                         provider_id=provider_id,
                         model_type=entry.model_type,
@@ -64,6 +87,8 @@ class RunConfigSettings(BaseModel):
     default_tool_groups: list[ToolGroupInput] | None = None
     default_datasets: list[DatasetInput] | None = None
     default_benchmarks: list[BenchmarkInput] | None = None
+    metadata_store: KVStoreConfig | None = None
+    inference_store: SqlStoreConfig | None = None
 
     def run_config(
         self,
@@ -114,11 +139,13 @@ class RunConfigSettings(BaseModel):
             container_image=container_image,
             apis=apis,
             providers=provider_configs,
-            metadata_store=SqliteKVStoreConfig.sample_run_config(
+            metadata_store=self.metadata_store
+            or SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=f"~/.llama/distributions/{name}",
                 db_name="registry.db",
             ),
-            inference_store=SqliteSqlStoreConfig.sample_run_config(
+            inference_store=self.inference_store
+            or SqliteSqlStoreConfig.sample_run_config(
                 __distro_dir__=f"~/.llama/distributions/{name}",
                 db_name="inference_store.db",
             ),
@@ -150,12 +177,26 @@ class DistributionTemplate(BaseModel):
 
     available_models_by_provider: dict[str, list[ProviderModelEntry]] | None = None
 
+    # we may want to specify additional pip packages without necessarily indicating a
+    # specific "default" inference store (which is what typically used to dictate additional
+    # pip packages)
+    additional_pip_packages: list[str] | None = None
+
     def build_config(self) -> BuildConfig:
         additional_pip_packages: list[str] = []
         for run_config in self.run_configs.values():
             run_config_ = run_config.run_config(self.name, self.providers, self.container_image)
+
+            # TODO: This is a hack to get the dependencies for internal APIs into build
+            # We should have a better way to do this by formalizing the concept of "internal" APIs
+            # and providers, with a way to specify dependencies for them.
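[editor's example] To illustrate the prefixing behaviour that the get_model_registry change above introduces, here is a minimal standalone sketch. It does not import llama_stack; "Entry" and "build_registry" are illustrative stand-ins for ProviderModelEntry and get_model_registry, and only the prefixing rule itself mirrors the diff.

    # Standalone sketch of the conflict-driven prefixing rule shown in the hunk above.
    from dataclasses import dataclass, field


    @dataclass
    class Entry:
        provider_model_id: str
        aliases: list[str] = field(default_factory=list)


    def build_registry(available: dict[str, list[Entry]]) -> list[str]:
        # If any model id or alias is claimed by more than one provider, every id
        # that does not already contain its provider id gets prefixed with it.
        seen: set[str] = set()
        conflict = False
        for entries in available.values():
            for e in entries:
                ids = [e.provider_model_id] + e.aliases
                if any(i in seen for i in ids):
                    conflict = True
                seen.update(ids)

        registry = []
        for provider_id, entries in available.items():
            for e in entries:
                for model_id in [e.provider_model_id] + e.aliases:
                    identifier = f"{provider_id}/{model_id}" if conflict and provider_id not in model_id else model_id
                    registry.append(identifier)
        return registry


    available = {
        "groq": [Entry("groq/llama3-8b-8192", aliases=["meta-llama/Llama-3.1-8B-Instruct"])],
        "sambanova": [Entry("sambanova/Meta-Llama-3.1-8B-Instruct", aliases=["meta-llama/Llama-3.1-8B-Instruct"])],
    }
    print(build_registry(available))
    # The shared alias forces prefixing, which is why the starter run.yaml hunks above
    # now register "groq/meta-llama/Llama-3.1-8B-Instruct" and
    # "sambanova/meta-llama/Llama-3.1-8B-Instruct" instead of the bare alias.

In other words, once two providers in the starter distribution expose the same alias, clients address models by the provider-prefixed identifier (for example "ollama/llama3.2:3b" or "groq/meta-llama/Llama-3.1-8B-Instruct"), while provider_model_id stays unchanged.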
if run_config_.inference_store: additional_pip_packages.extend(run_config_.inference_store.pip_packages) + if run_config_.metadata_store: + additional_pip_packages.extend(run_config_.metadata_store.pip_packages) + + if self.additional_pip_packages: + additional_pip_packages.extend(self.additional_pip_packages) return BuildConfig( distribution_spec=DistributionSpec( @@ -164,7 +205,7 @@ class DistributionTemplate(BaseModel): providers=self.providers, ), image_type="conda", # default to conda, can be overridden - additional_pip_packages=additional_pip_packages, + additional_pip_packages=sorted(set(additional_pip_packages)), ) def generate_markdown_docs(self) -> str: diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml index 652900c84..361b0b680 100644 --- a/llama_stack/templates/tgi/build.yaml +++ b/llama_stack/templates/tgi/build.yaml @@ -31,5 +31,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index c797b93aa..22b7bcde6 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -48,7 +48,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/trace_store.db eval: diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 7e91d20bd..dd012323c 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -47,7 +47,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/trace_store.db eval: diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml index 4a556a66f..5ffeac873 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -32,5 +32,5 @@ distribution_spec: - remote::wolfram-alpha image_type: conda additional_pip_packages: -- sqlalchemy[asyncio] +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index 190a0400b..a24843416 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -53,7 +53,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/trace_store.db eval: diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index ce9542130..c71f960bd 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -48,7 +48,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: 
"${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/trace_store.db eval: diff --git a/llama_stack/templates/verification/build.yaml b/llama_stack/templates/verification/build.yaml deleted file mode 100644 index cb7ab4798..000000000 --- a/llama_stack/templates/verification/build.yaml +++ /dev/null @@ -1,39 +0,0 @@ -version: '2' -distribution_spec: - description: Distribution for running e2e tests in CI - providers: - inference: - - remote::openai - - remote::fireworks-openai-compat - - remote::together-openai-compat - - remote::groq-openai-compat - - remote::sambanova-openai-compat - - remote::cerebras-openai-compat - - inline::sentence-transformers - vector_io: - - inline::sqlite-vec - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/verification/run.yaml b/llama_stack/templates/verification/run.yaml deleted file mode 100644 index 58b3c576c..000000000 --- a/llama_stack/templates/verification/run.yaml +++ /dev/null @@ -1,731 +0,0 @@ -version: '2' -image_name: verification -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: openai - provider_type: remote::openai - config: - api_key: ${env.OPENAI_API_KEY:} - - provider_id: fireworks-openai-compat - provider_type: remote::fireworks-openai-compat - config: - openai_compat_api_base: https://api.fireworks.ai/inference/v1 - api_key: ${env.FIREWORKS_API_KEY:} - - provider_id: together-openai-compat - provider_type: remote::together-openai-compat - config: - openai_compat_api_base: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY:} - - provider_id: groq-openai-compat - provider_type: remote::groq-openai-compat - config: - openai_compat_api_base: https://api.groq.com/openai/v1 - api_key: ${env.GROQ_API_KEY:} - - provider_id: sambanova-openai-compat - provider_type: remote::sambanova-openai-compat - config: - openai_compat_api_base: https://api.sambanova.ai/v1 - api_key: ${env.SAMBANOVA_API_KEY:} - - provider_id: cerebras-openai-compat - provider_type: remote::cerebras-openai-compat - config: - openai_compat_api_base: https://api.cerebras.ai/v1 - api_key: ${env.CEREBRAS_API_KEY:} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: sqlite-vec - provider_type: inline::sqlite-vec - config: - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/sqlite_vec.db - - provider_id: ${env.ENABLE_CHROMADB+chromadb} - provider_type: remote::chromadb - config: - url: ${env.CHROMADB_URL:} - - provider_id: ${env.ENABLE_PGVECTOR+pgvector} - provider_type: remote::pgvector - config: - host: ${env.PGVECTOR_HOST:localhost} - port: ${env.PGVECTOR_PORT:5432} - db: ${env.PGVECTOR_DB:} - user: ${env.PGVECTOR_USER:} - password: ${env.PGVECTOR_PASSWORD:} - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - 
provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: ${env.OTEL_SERVICE_NAME:} - sinks: ${env.TELEMETRY_SINKS:console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/inference_store.db -models: -- metadata: {} - model_id: openai/gpt-4o - provider_id: openai - provider_model_id: openai/gpt-4o - model_type: llm -- metadata: {} - model_id: openai/gpt-4o-mini - provider_id: openai - provider_model_id: openai/gpt-4o-mini - model_type: llm -- metadata: {} - model_id: openai/chatgpt-4o-latest - provider_id: openai - provider_model_id: openai/chatgpt-4o-latest - model_type: llm -- metadata: {} - model_id: gpt-3.5-turbo-0125 - provider_id: openai - provider_model_id: gpt-3.5-turbo-0125 - model_type: llm -- metadata: {} - model_id: gpt-3.5-turbo - provider_id: openai - provider_model_id: gpt-3.5-turbo - model_type: llm -- metadata: {} - model_id: gpt-3.5-turbo-instruct - provider_id: openai - provider_model_id: gpt-3.5-turbo-instruct - model_type: llm -- metadata: {} - model_id: gpt-4 - provider_id: openai - provider_model_id: gpt-4 - model_type: llm -- metadata: {} - model_id: gpt-4-turbo - provider_id: openai - provider_model_id: gpt-4-turbo - model_type: llm -- metadata: {} - model_id: gpt-4o - provider_id: openai - provider_model_id: gpt-4o - model_type: llm -- metadata: {} - model_id: gpt-4o-2024-08-06 - provider_id: openai - provider_model_id: gpt-4o-2024-08-06 - model_type: llm -- metadata: {} - model_id: gpt-4o-mini - provider_id: openai - 
provider_model_id: gpt-4o-mini - model_type: llm -- metadata: {} - model_id: gpt-4o-audio-preview - provider_id: openai - provider_model_id: gpt-4o-audio-preview - model_type: llm -- metadata: {} - model_id: chatgpt-4o-latest - provider_id: openai - provider_model_id: chatgpt-4o-latest - model_type: llm -- metadata: {} - model_id: o1 - provider_id: openai - provider_model_id: o1 - model_type: llm -- metadata: {} - model_id: o1-mini - provider_id: openai - provider_model_id: o1-mini - model_type: llm -- metadata: {} - model_id: o3-mini - provider_id: openai - provider_model_id: o3-mini - model_type: llm -- metadata: {} - model_id: o4-mini - provider_id: openai - provider_model_id: o4-mini - model_type: llm -- metadata: - embedding_dimension: 1536 - context_length: 8192 - model_id: openai/text-embedding-3-small - provider_id: openai - provider_model_id: openai/text-embedding-3-small - model_type: embedding -- metadata: - embedding_dimension: 3072 - context_length: 8192 - model_id: openai/text-embedding-3-large - provider_id: openai - provider_model_id: openai/text-embedding-3-large - model_type: embedding -- metadata: - embedding_dimension: 1536 - context_length: 8192 - model_id: text-embedding-3-small - provider_id: openai - provider_model_id: text-embedding-3-small - model_type: embedding -- metadata: - embedding_dimension: 3072 - context_length: 8192 - model_id: text-embedding-3-large - provider_id: openai - provider_model_id: text-embedding-3-large - model_type: embedding -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: 
accounts/fireworks/models/llama-v3p2-90b-vision-instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-8b - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-guard-3-8b - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-guard-3-8b - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-11b-vision - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama4-scout-instruct-basic - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: fireworks-openai-compat - provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: nomic-ai/nomic-embed-text-v1.5 - provider_id: fireworks-openai-compat - provider_model_id: nomic-ai/nomic-embed-text-v1.5 - model_type: embedding -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - provider_id: together-openai-compat - provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - provider_id: together-openai-compat - provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - provider_id: together-openai-compat - provider_model_id: 
meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: together-openai-compat - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-Guard-3-8B - provider_id: together-openai-compat - provider_model_id: meta-llama/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: together-openai-compat - provider_model_id: meta-llama/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: togethercomputer/m2-bert-80M-8k-retrieval - provider_id: together-openai-compat - provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval - model_type: embedding -- metadata: - embedding_dimension: 768 - context_length: 32768 - model_id: togethercomputer/m2-bert-80M-32k-retrieval - provider_id: together-openai-compat - provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval - model_type: embedding -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - 
model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - provider_id: together-openai-compat - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: groq/llama3-8b-8192 - provider_id: groq-openai-compat - provider_model_id: groq/llama3-8b-8192 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: groq-openai-compat - provider_model_id: groq/llama3-8b-8192 - model_type: llm -- metadata: {} - model_id: groq/llama-3.1-8b-instant - provider_id: groq-openai-compat - provider_model_id: groq/llama-3.1-8b-instant - model_type: llm -- metadata: {} - model_id: groq/llama3-70b-8192 - provider_id: groq-openai-compat - provider_model_id: groq/llama3-70b-8192 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3-70B-Instruct - provider_id: groq-openai-compat - provider_model_id: groq/llama3-70b-8192 - model_type: llm -- metadata: {} - model_id: groq/llama-3.3-70b-versatile - provider_id: groq-openai-compat - provider_model_id: groq/llama-3.3-70b-versatile - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: groq-openai-compat - provider_model_id: groq/llama-3.3-70b-versatile - model_type: llm -- metadata: {} - model_id: groq/llama-3.2-3b-preview - provider_id: groq-openai-compat - provider_model_id: groq/llama-3.2-3b-preview - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: groq-openai-compat - provider_model_id: groq/llama-3.2-3b-preview - model_type: llm -- metadata: {} - model_id: groq/llama-4-scout-17b-16e-instruct - provider_id: groq-openai-compat - provider_model_id: groq/llama-4-scout-17b-16e-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: groq-openai-compat - provider_model_id: groq/llama-4-scout-17b-16e-instruct - model_type: llm -- metadata: {} - model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct - provider_id: groq-openai-compat - provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: groq-openai-compat - provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct - model_type: llm -- metadata: {} - model_id: groq/llama-4-maverick-17b-128e-instruct - provider_id: groq-openai-compat - provider_model_id: groq/llama-4-maverick-17b-128e-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: groq-openai-compat - provider_model_id: groq/llama-4-maverick-17b-128e-instruct - model_type: llm -- metadata: {} - model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct - provider_id: groq-openai-compat - provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: groq-openai-compat - provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.1-8B-Instruct 
- provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.1-405B-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.2-1B-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.2-3B-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.3-70B-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Llama-3.2-11B-Vision-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Llama-3.2-90B-Vision-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Llama-4-Scout-17B-16E-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-Guard-3-8B - provider_id: sambanova-openai-compat - provider_model_id: sambanova/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: sambanova-openai-compat - provider_model_id: 
sambanova/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: llama3.1-8b - provider_id: cerebras-openai-compat - provider_model_id: llama3.1-8b - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: cerebras-openai-compat - provider_model_id: llama3.1-8b - model_type: llm -- metadata: {} - model_id: llama-3.3-70b - provider_id: cerebras-openai-compat - provider_model_id: llama-3.3-70b - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: cerebras-openai-compat - provider_model_id: llama-3.3-70b - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/verification/verification.py b/llama_stack/templates/verification/verification.py deleted file mode 100644 index b58400f26..000000000 --- a/llama_stack/templates/verification/verification.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -from llama_stack.apis.models.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( - SQLiteVectorIOConfig, -) -from llama_stack.providers.remote.inference.cerebras.models import MODEL_ENTRIES as CEREBRAS_MODEL_ENTRIES -from llama_stack.providers.remote.inference.cerebras_openai_compat.config import CerebrasCompatConfig -from llama_stack.providers.remote.inference.fireworks.models import ( - MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES, -) -from llama_stack.providers.remote.inference.fireworks_openai_compat.config import FireworksCompatConfig -from llama_stack.providers.remote.inference.groq.models import ( - MODEL_ENTRIES as GROQ_MODEL_ENTRIES, -) -from llama_stack.providers.remote.inference.groq_openai_compat.config import GroqCompatConfig -from llama_stack.providers.remote.inference.openai.config import OpenAIConfig -from llama_stack.providers.remote.inference.openai.models import ( - MODEL_ENTRIES as OPENAI_MODEL_ENTRIES, -) -from llama_stack.providers.remote.inference.sambanova.models import MODEL_ENTRIES as SAMBANOVA_MODEL_ENTRIES -from llama_stack.providers.remote.inference.sambanova_openai_compat.config import SambaNovaCompatConfig -from llama_stack.providers.remote.inference.together.models import ( - MODEL_ENTRIES as TOGETHER_MODEL_ENTRIES, -) -from llama_stack.providers.remote.inference.together_openai_compat.config import TogetherCompatConfig -from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig -from llama_stack.providers.remote.vector_io.pgvector.config import ( - PGVectorVectorIOConfig, -) -from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_inference_providers() -> 
tuple[list[Provider], dict[str, list[ProviderModelEntry]]]: - # in this template, we allow each API key to be optional - providers = [ - ( - "openai", - OPENAI_MODEL_ENTRIES, - OpenAIConfig.sample_run_config(api_key="${env.OPENAI_API_KEY:}"), - ), - ( - "fireworks-openai-compat", - FIREWORKS_MODEL_ENTRIES, - FireworksCompatConfig.sample_run_config(api_key="${env.FIREWORKS_API_KEY:}"), - ), - ( - "together-openai-compat", - TOGETHER_MODEL_ENTRIES, - TogetherCompatConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:}"), - ), - ( - "groq-openai-compat", - GROQ_MODEL_ENTRIES, - GroqCompatConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"), - ), - ( - "sambanova-openai-compat", - SAMBANOVA_MODEL_ENTRIES, - SambaNovaCompatConfig.sample_run_config(api_key="${env.SAMBANOVA_API_KEY:}"), - ), - ( - "cerebras-openai-compat", - CEREBRAS_MODEL_ENTRIES, - CerebrasCompatConfig.sample_run_config(api_key="${env.CEREBRAS_API_KEY:}"), - ), - ] - inference_providers = [] - available_models = {} - for provider_id, model_entries, config in providers: - inference_providers.append( - Provider( - provider_id=provider_id, - provider_type=f"remote::{provider_id}", - config=config, - ) - ) - available_models[provider_id] = model_entries - return inference_providers, available_models - - -def get_distribution_template() -> DistributionTemplate: - inference_providers, available_models = get_inference_providers() - providers = { - "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]), - "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - name = "verification" - - vector_io_providers = [ - Provider( - provider_id="sqlite-vec", - provider_type="inline::sqlite-vec", - config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ), - Provider( - provider_id="${env.ENABLE_CHROMADB+chromadb}", - provider_type="remote::chromadb", - config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"), - ), - Provider( - provider_id="${env.ENABLE_PGVECTOR+pgvector}", - provider_type="remote::pgvector", - config=PGVectorVectorIOConfig.sample_run_config( - db="${env.PGVECTOR_DB:}", - user="${env.PGVECTOR_USER:}", - password="${env.PGVECTOR_PASSWORD:}", - ), - ), - ] - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id=embedding_provider.provider_id, - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - - default_models = get_model_registry(available_models) - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Distribution for running e2e tests in CI", - container_image=None, - template_path=None, - 
providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": inference_providers + [embedding_provider], - "vector_io": vector_io_providers, - }, - default_models=default_models + [embedding_model], - default_tool_groups=default_tool_groups, - default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "FIREWORKS_API_KEY": ( - "", - "Fireworks API Key", - ), - "OPENAI_API_KEY": ( - "", - "OpenAI API Key", - ), - }, - ) diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml index 5a9d003cb..d5ff0f1f4 100644 --- a/llama_stack/templates/vllm-gpu/build.yaml +++ b/llama_stack/templates/vllm-gpu/build.yaml @@ -31,4 +31,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 6937e2bac..6878c22b2 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -52,7 +52,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/trace_store.db eval: diff --git a/llama_stack/templates/watsonx/build.yaml b/llama_stack/templates/watsonx/build.yaml index 87233fb26..e68ace183 100644 --- a/llama_stack/templates/watsonx/build.yaml +++ b/llama_stack/templates/watsonx/build.yaml @@ -29,4 +29,5 @@ distribution_spec: - remote::model-context-protocol image_type: conda additional_pip_packages: +- aiosqlite - sqlalchemy[asyncio] diff --git a/llama_stack/templates/watsonx/run.yaml b/llama_stack/templates/watsonx/run.yaml index e7222fd57..d60a87906 100644 --- a/llama_stack/templates/watsonx/run.yaml +++ b/llama_stack/templates/watsonx/run.yaml @@ -49,7 +49,7 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: - service_name: ${env.OTEL_SERVICE_NAME:} + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/trace_store.db eval: diff --git a/llama_stack/ui/app/api/v1/[...path]/route.ts b/llama_stack/ui/app/api/v1/[...path]/route.ts new file mode 100644 index 000000000..1959f9099 --- /dev/null +++ b/llama_stack/ui/app/api/v1/[...path]/route.ts @@ -0,0 +1,105 @@ +import { NextRequest, NextResponse } from "next/server"; + +// Get backend URL from environment variable or default to localhost for development +const BACKEND_URL = + process.env.LLAMA_STACK_BACKEND_URL || + `http://localhost:${process.env.LLAMA_STACK_PORT || 8321}`; + +async function proxyRequest(request: NextRequest, method: string) { + try { + // Extract the path from the request URL + const url = new URL(request.url); + const pathSegments = url.pathname.split("/"); + + // Remove /api from the path to get the actual API path + // /api/v1/models/list -> /v1/models/list + const apiPath = pathSegments.slice(2).join("/"); // Remove 'api' segment + const targetUrl = `${BACKEND_URL}/${apiPath}${url.search}`; + + console.log(`Proxying ${method} ${url.pathname} -> ${targetUrl}`); + + // Prepare 
headers (exclude host and other problematic headers) + const headers = new Headers(); + request.headers.forEach((value, key) => { + // Skip headers that might cause issues in proxy + if ( + !["host", "connection", "content-length"].includes(key.toLowerCase()) + ) { + headers.set(key, value); + } + }); + + // Prepare the request options + const requestOptions: RequestInit = { + method, + headers, + }; + + // Add body for methods that support it + if (["POST", "PUT", "PATCH"].includes(method) && request.body) { + requestOptions.body = await request.text(); + } + + // Make the request to FastAPI backend + const response = await fetch(targetUrl, requestOptions); + + // Get response data + const responseText = await response.text(); + + console.log( + `Response from FastAPI: ${response.status} ${response.statusText}`, + ); + + // Create response with same status and headers + const proxyResponse = new NextResponse(responseText, { + status: response.status, + statusText: response.statusText, + }); + + // Copy response headers (except problematic ones) + response.headers.forEach((value, key) => { + if (!["connection", "transfer-encoding"].includes(key.toLowerCase())) { + proxyResponse.headers.set(key, value); + } + }); + + return proxyResponse; + } catch (error) { + console.error("Proxy request failed:", error); + + return NextResponse.json( + { + error: "Proxy request failed", + message: error instanceof Error ? error.message : "Unknown error", + backend_url: BACKEND_URL, + timestamp: new Date().toISOString(), + }, + { status: 500 }, + ); + } +} + +// HTTP method handlers +export async function GET(request: NextRequest) { + return proxyRequest(request, "GET"); +} + +export async function POST(request: NextRequest) { + return proxyRequest(request, "POST"); +} + +export async function PUT(request: NextRequest) { + return proxyRequest(request, "PUT"); +} + +export async function DELETE(request: NextRequest) { + return proxyRequest(request, "DELETE"); +} + +export async function PATCH(request: NextRequest) { + return proxyRequest(request, "PATCH"); +} + +export async function OPTIONS(request: NextRequest) { + return proxyRequest(request, "OPTIONS"); +} diff --git a/llama_stack/ui/lib/client.ts b/llama_stack/ui/lib/client.ts index df2a8e2f2..8492496e2 100644 --- a/llama_stack/ui/lib/client.ts +++ b/llama_stack/ui/lib/client.ts @@ -1,12 +1,6 @@ import LlamaStackClient from "llama-stack-client"; -import OpenAI from "openai"; -export const client = - process.env.NEXT_PUBLIC_USE_OPENAI_CLIENT === "true" // useful for testing - ? new OpenAI({ - apiKey: process.env.NEXT_PUBLIC_OPENAI_API_KEY, - dangerouslyAllowBrowser: true, - }) - : new LlamaStackClient({ - baseURL: process.env.NEXT_PUBLIC_LLAMA_STACK_BASE_URL, - }); +export const client = new LlamaStackClient({ + baseURL: + typeof window !== "undefined" ? 
`${window.location.origin}/api` : "/api", +}); diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json index 931faa60a..3c60dbb39 100644 --- a/llama_stack/ui/package-lock.json +++ b/llama_stack/ui/package-lock.json @@ -15,11 +15,10 @@ "@radix-ui/react-tooltip": "^1.2.6", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", - "llama-stack-client": "github:stainless-sdks/llama-stack-node#ehhuang/dev", + "llama-stack-client": "0.2.9", "lucide-react": "^0.510.0", "next": "15.3.2", "next-themes": "^0.4.6", - "openai": "^4.103.0", "react": "^19.0.0", "react-dom": "^19.0.0", "tailwind-merge": "^3.3.0" @@ -677,6 +676,406 @@ "tslib": "^2.4.0" } }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.5.tgz", + "integrity": "sha512-9o3TMmpmftaCMepOdA5k/yDw8SfInyzWWTjYTFCX3kPSDJMROQTb8jg+h9Cnwnmm1vOzvxN7gIfB5V2ewpjtGA==", + "cpu": [ + "ppc64" + ], + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.5.tgz", + "integrity": "sha512-AdJKSPeEHgi7/ZhuIPtcQKr5RQdo6OO2IL87JkianiMYMPbCtot9fxPbrMiBADOWWm3T2si9stAiVsGbTQFkbA==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.5.tgz", + "integrity": "sha512-VGzGhj4lJO+TVGV1v8ntCZWJktV7SGCs3Pn1GRWI1SBFtRALoomm8k5E9Pmwg3HOAal2VDc2F9+PM/rEY6oIDg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.5.tgz", + "integrity": "sha512-D2GyJT1kjvO//drbRT3Hib9XPwQeWd9vZoBJn+bu/lVsOZ13cqNdDeqIF/xQ5/VmWvMduP6AmXvylO/PIc2isw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.5.tgz", + "integrity": "sha512-GtaBgammVvdF7aPIgH2jxMDdivezgFu6iKpmT+48+F8Hhg5J/sfnDieg0aeG/jfSvkYQU2/pceFPDKlqZzwnfQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.5.tgz", + "integrity": "sha512-1iT4FVL0dJ76/q1wd7XDsXrSW+oLoquptvh4CLR4kITDtqi2e/xwXwdCVH8hVHU43wgJdsq7Gxuzcs6Iq/7bxQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.5.tgz", + "integrity": "sha512-nk4tGP3JThz4La38Uy/gzyXtpkPW8zSAmoUhK9xKKXdBCzKODMc2adkB2+8om9BDYugz+uGV7sLmpTYzvmz6Sw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.5", + 
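[editor's example] The new UI route above (app/api/v1/[...path]/route.ts) strips the "/api" prefix and forwards requests to LLAMA_STACK_BACKEND_URL, and the updated lib/client.ts points the browser's LlamaStackClient at `${window.location.origin}/api` so all traffic goes through that proxy. A hedged usage sketch follows; the Next.js dev-server port 3000 is an assumption, the backend port 8321 is the default from this diff, and the exact JSON shape of the /v1/models response is not shown here, so only the raw status and body prefix are printed.

    # Exercise the UI proxy route from the command line (standard library only).
    # GET http://localhost:3000/api/v1/models is forwarded by the route handler
    # to GET http://localhost:8321/v1/models on the Llama Stack backend.
    import urllib.request

    with urllib.request.urlopen("http://localhost:3000/api/v1/models") as resp:
        print(resp.status, resp.read()[:200])
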
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.5.tgz", + "integrity": "sha512-PrikaNjiXdR2laW6OIjlbeuCPrPaAl0IwPIaRv+SMV8CiM8i2LqVUHFC1+8eORgWyY7yhQY+2U2fA55mBzReaw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.5.tgz", + "integrity": "sha512-cPzojwW2okgh7ZlRpcBEtsX7WBuqbLrNXqLU89GxWbNt6uIg78ET82qifUy3W6OVww6ZWobWub5oqZOVtwolfw==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.5.tgz", + "integrity": "sha512-Z9kfb1v6ZlGbWj8EJk9T6czVEjjq2ntSYLY2cw6pAZl4oKtfgQuS4HOq41M/BcoLPzrUbNd+R4BXFyH//nHxVg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.5.tgz", + "integrity": "sha512-sQ7l00M8bSv36GLV95BVAdhJ2QsIbCuCjh/uYrWiMQSUuV+LpXwIqhgJDcvMTj+VsQmqAHL2yYaasENvJ7CDKA==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.5.tgz", + "integrity": "sha512-0ur7ae16hDUC4OL5iEnDb0tZHDxYmuQyhKhsPBV8f99f6Z9KQM02g33f93rNH5A30agMS46u2HP6qTdEt6Q1kg==", + "cpu": [ + "loong64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.5.tgz", + "integrity": "sha512-kB/66P1OsHO5zLz0i6X0RxlQ+3cu0mkxS3TKFvkb5lin6uwZ/ttOkP3Z8lfR9mJOBk14ZwZ9182SIIWFGNmqmg==", + "cpu": [ + "mips64el" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.5.tgz", + "integrity": "sha512-UZCmJ7r9X2fe2D6jBmkLBMQetXPXIsZjQJCjgwpVDz+YMcS6oFR27alkgGv3Oqkv07bxdvw7fyB71/olceJhkQ==", + "cpu": [ + "ppc64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.5.tgz", + "integrity": "sha512-kTxwu4mLyeOlsVIFPfQo+fQJAV9mh24xL+y+Bm6ej067sYANjyEw1dNHmvoqxJUCMnkBdKpvOn0Ahql6+4VyeA==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.5.tgz", + "integrity": "sha512-K2dSKTKfmdh78uJ3NcWFiqyRrimfdinS5ErLSn3vluHNeHVnBAFWC8a4X5N+7FgVE1EjXS1QDZbpqZBjfrqMTQ==", + "cpu": [ + "s390x" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + 
"node_modules/@esbuild/linux-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.5.tgz", + "integrity": "sha512-uhj8N2obKTE6pSZ+aMUbqq+1nXxNjZIIjCjGLfsWvVpy7gKCOL6rsY1MhRh9zLtUtAI7vpgLMK6DxjO8Qm9lJw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.5.tgz", + "integrity": "sha512-pwHtMP9viAy1oHPvgxtOv+OkduK5ugofNTVDilIzBLpoWAM16r7b/mxBvfpuQDpRQFMfuVr5aLcn4yveGvBZvw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.5.tgz", + "integrity": "sha512-WOb5fKrvVTRMfWFNCroYWWklbnXH0Q5rZppjq0vQIdlsQKuw6mdSihwSo4RV/YdQ5UCKKvBy7/0ZZYLBZKIbwQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.5.tgz", + "integrity": "sha512-7A208+uQKgTxHd0G0uqZO8UjK2R0DDb4fDmERtARjSHWxqMTye4Erz4zZafx7Di9Cv+lNHYuncAkiGFySoD+Mw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.5.tgz", + "integrity": "sha512-G4hE405ErTWraiZ8UiSoesH8DaCsMm0Cay4fsFWOOUcz8b8rC6uCvnagr+gnioEjWn0wC+o1/TAHt+It+MpIMg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.5.tgz", + "integrity": "sha512-l+azKShMy7FxzY0Rj4RCt5VD/q8mG/e+mDivgspo+yL8zW7qEwctQ6YqKX34DTEleFAvCIUviCFX1SDZRSyMQA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.5.tgz", + "integrity": "sha512-O2S7SNZzdcFG7eFKgvwUEZ2VG9D/sn/eIiz8XRZ1Q/DO5a3s76Xv0mdBzVM5j5R639lXQmPmSo0iRpHqUUrsxw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.5.tgz", + "integrity": "sha512-onOJ02pqs9h1iMJ1PQphR+VZv8qBMQ77Klcsqv9CNW2w6yLqoURLcgERAIurY6QE63bbLuqgP9ATqajFLK5AMQ==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.5.tgz", + "integrity": "sha512-TXv6YnJ8ZMVdX+SXWVBo/0p8LTcrUYngpWjvm91TMjjBQii7Oz11Lw5lbDV5Y0TzuhSJHwiH4hEtC1I42mMS0g==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + 
"node": ">=18" + } + }, "node_modules/@eslint-community/eslint-utils": { "version": "4.7.0", "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.7.0.tgz", @@ -5601,6 +6000,46 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/esbuild": { + "version": "0.25.5", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.5.tgz", + "integrity": "sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==", + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.5", + "@esbuild/android-arm": "0.25.5", + "@esbuild/android-arm64": "0.25.5", + "@esbuild/android-x64": "0.25.5", + "@esbuild/darwin-arm64": "0.25.5", + "@esbuild/darwin-x64": "0.25.5", + "@esbuild/freebsd-arm64": "0.25.5", + "@esbuild/freebsd-x64": "0.25.5", + "@esbuild/linux-arm": "0.25.5", + "@esbuild/linux-arm64": "0.25.5", + "@esbuild/linux-ia32": "0.25.5", + "@esbuild/linux-loong64": "0.25.5", + "@esbuild/linux-mips64el": "0.25.5", + "@esbuild/linux-ppc64": "0.25.5", + "@esbuild/linux-riscv64": "0.25.5", + "@esbuild/linux-s390x": "0.25.5", + "@esbuild/linux-x64": "0.25.5", + "@esbuild/netbsd-arm64": "0.25.5", + "@esbuild/netbsd-x64": "0.25.5", + "@esbuild/openbsd-arm64": "0.25.5", + "@esbuild/openbsd-x64": "0.25.5", + "@esbuild/sunos-x64": "0.25.5", + "@esbuild/win32-arm64": "0.25.5", + "@esbuild/win32-ia32": "0.25.5", + "@esbuild/win32-x64": "0.25.5" + } + }, "node_modules/escalade": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", @@ -6555,7 +6994,6 @@ "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", - "dev": true, "hasInstallScript": true, "license": "MIT", "optional": true, @@ -6717,7 +7155,6 @@ "version": "4.10.0", "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.10.0.tgz", "integrity": "sha512-kGzZ3LWWQcGIAmg6iWvXn0ei6WDtV26wzHRMwDSzmAbcXrTEXxHy6IehI6/4eT6VRKyMP1eF1VqwrVUmE/LR7A==", - "dev": true, "license": "MIT", "dependencies": { "resolve-pkg-maps": "^1.0.0" @@ -9092,8 +9529,9 @@ "license": "MIT" }, "node_modules/llama-stack-client": { - "version": "0.0.1-alpha.0", - "resolved": "git+ssh://git@github.com/stainless-sdks/llama-stack-node.git#5d34d229fb53b6dad02da0f19f4b310b529c6b15", + "version": "0.2.9", + "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.9.tgz", + "integrity": "sha512-7+2WuPYt2j/k/Twh5IGn8hd8q4W6lVEK+Ql4PpICGLj4N8YmooCfydI1UvdT2UlX7PNYKNeyeFqTifWT2MjWKg==", "license": "Apache-2.0", "dependencies": { "@types/node": "^18.11.18", @@ -9102,7 +9540,8 @@ "agentkeepalive": "^4.2.1", "form-data-encoder": "1.7.2", "formdata-node": "^4.3.2", - "node-fetch": "^2.6.7" + "node-fetch": "^2.6.7", + "tsx": "^4.19.2" } }, "node_modules/llama-stack-client/node_modules/@types/node": { @@ -9805,51 +10244,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/openai": { - "version": "4.103.0", - "resolved": "https://registry.npmjs.org/openai/-/openai-4.103.0.tgz", - "integrity": "sha512-eWcz9kdurkGOFDtd5ySS5y251H2uBgq9+1a2lTBnjMMzlexJ40Am5t6Mu76SSE87VvitPa0dkIAp75F+dZVC0g==", - "license": "Apache-2.0", - "dependencies": { - "@types/node": "^18.11.18", - "@types/node-fetch": "^2.6.4", - "abort-controller": "^3.0.0", - "agentkeepalive": 
"^4.2.1", - "form-data-encoder": "1.7.2", - "formdata-node": "^4.3.2", - "node-fetch": "^2.6.7" - }, - "bin": { - "openai": "bin/cli" - }, - "peerDependencies": { - "ws": "^8.18.0", - "zod": "^3.23.8" - }, - "peerDependenciesMeta": { - "ws": { - "optional": true - }, - "zod": { - "optional": true - } - } - }, - "node_modules/openai/node_modules/@types/node": { - "version": "18.19.103", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.103.tgz", - "integrity": "sha512-hHTHp+sEz6SxFsp+SA+Tqrua3AbmlAw+Y//aEwdHrdZkYVRWdvWD3y5uPZ0flYOkgskaFWqZ/YGFm3FaFQ0pRw==", - "license": "MIT", - "dependencies": { - "undici-types": "~5.26.4" - } - }, - "node_modules/openai/node_modules/undici-types": { - "version": "5.26.5", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", - "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", - "license": "MIT" - }, "node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", @@ -10631,7 +11025,6 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", - "dev": true, "license": "MIT", "funding": { "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" @@ -11682,6 +12075,25 @@ "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", "license": "0BSD" }, + "node_modules/tsx": { + "version": "4.19.4", + "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.19.4.tgz", + "integrity": "sha512-gK5GVzDkJK1SI1zwHf32Mqxf2tSJkNx+eYcNly5+nHvWqXUJYUkWBQtKauoESz3ymezAI++ZwT855x5p5eop+Q==", + "license": "MIT", + "dependencies": { + "esbuild": "~0.25.0", + "get-tsconfig": "^4.7.5" + }, + "bin": { + "tsx": "dist/cli.mjs" + }, + "engines": { + "node": ">=18.0.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + } + }, "node_modules/tw-animate-css": { "version": "1.2.9", "resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.2.9.tgz", @@ -12269,7 +12681,7 @@ "version": "8.18.2", "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.2.tgz", "integrity": "sha512-DMricUmwGZUVr++AEAe2uiVM7UoO9MAVZMDu05UQOaUII0lp+zOzLLU4Xqh/JvTqklB1T4uELaaPBKyjE1r4fQ==", - "devOptional": true, + "dev": true, "license": "MIT", "engines": { "node": ">=10.0.0" @@ -12380,7 +12792,7 @@ "version": "3.24.4", "resolved": "https://registry.npmjs.org/zod/-/zod-3.24.4.tgz", "integrity": "sha512-OdqJE9UDRPwWsrHjLN2F8bPxvwJBK22EHLWtanu0LSYr5YqzsaaW3RMgmjwr8Rypg5k+meEJdSPXJZXE/yqOMg==", - "devOptional": true, + "dev": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/colinhacks" diff --git a/llama_stack/ui/package.json b/llama_stack/ui/package.json index e961595cc..e6c49f182 100644 --- a/llama_stack/ui/package.json +++ b/llama_stack/ui/package.json @@ -19,7 +19,7 @@ "@radix-ui/react-tooltip": "^1.2.6", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", - "llama-stack-client": "0.2.8", + "llama-stack-client": "0.2.10", "lucide-react": "^0.510.0", "next": "15.3.2", "next-themes": "^0.4.6", diff --git a/pyproject.toml b/pyproject.toml index 2bb6292aa..80025929d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.2.8" +version = "0.2.10" authors = [{ name = "Meta Llama", email = 
"llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" @@ -22,12 +22,13 @@ classifiers = [ ] dependencies = [ "aiohttp", + "fastapi>=0.115.0,<1.0", "fire", "httpx", "huggingface-hub", "jinja2>=3.1.6", "jsonschema", - "llama-stack-client>=0.2.8", + "llama-stack-client>=0.2.10", "openai>=1.66", "prompt-toolkit", "python-dotenv", @@ -41,13 +42,14 @@ dependencies = [ "tiktoken", "pillow", "h11>=0.16.0", + "python-multipart>=0.0.20", ] [project.optional-dependencies] ui = [ "streamlit", "pandas", - "llama-stack-client>=0.2.8", + "llama-stack-client>=0.2.10", "streamlit-option-menu", ] @@ -66,7 +68,6 @@ dev = [ "types-setuptools", "pre-commit", "uvicorn", - "fastapi", "ruamel.yaml", # needed for openapi generator ] # These are the dependencies required for running unit tests. @@ -130,9 +131,9 @@ Homepage = "https://github.com/meta-llama/llama-stack" llama = "llama_stack.cli.llama:main" install-wheel-from-presigned = "llama_stack.cli.scripts.run:install_wheel_from_presigned" -[tool.setuptools] -packages = { find = {} } -license-files = [] +[tool.setuptools.packages.find] +where = ["."] +include = ["llama_stack", "llama_stack.*"] [[tool.uv.index]] name = "pytorch-cpu" diff --git a/requirements.txt b/requirements.txt index 0c079a855..cfd63b456 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,6 +42,8 @@ ecdsa==0.19.1 # via python-jose exceptiongroup==1.2.2 ; python_full_version < '3.11' # via anyio +fastapi==0.115.8 + # via llama-stack filelock==3.17.0 # via huggingface-hub fire==0.7.0 @@ -79,7 +81,7 @@ jsonschema==4.23.0 # via llama-stack jsonschema-specifications==2024.10.1 # via jsonschema -llama-stack-client==0.2.8 +llama-stack-client==0.2.10 # via llama-stack markdown-it-py==3.0.0 # via rich @@ -117,6 +119,7 @@ pyasn1==0.4.8 # rsa pydantic==2.10.6 # via + # fastapi # llama-stack # llama-stack-client # openai @@ -130,6 +133,8 @@ python-dotenv==1.0.1 # via llama-stack python-jose==3.4.0 # via llama-stack +python-multipart==0.0.20 + # via llama-stack pytz==2025.1 # via pandas pyyaml==6.0.2 @@ -169,7 +174,9 @@ sniffio==1.3.1 # llama-stack-client # openai starlette==0.45.3 - # via llama-stack + # via + # fastapi + # llama-stack termcolor==2.5.0 # via # fire @@ -185,6 +192,7 @@ tqdm==4.67.1 typing-extensions==4.12.2 # via # anyio + # fastapi # huggingface-hub # llama-stack-client # multidict diff --git a/scripts/distro_codegen.py b/scripts/distro_codegen.py index 8820caf55..b59cd3481 100755 --- a/scripts/distro_codegen.py +++ b/scripts/distro_codegen.py @@ -15,11 +15,6 @@ from pathlib import Path from rich.progress import Progress, SpinnerColumn, TextColumn -from llama_stack.distribution.build import ( - SERVER_DEPENDENCIES, - get_provider_dependencies, -) - REPO_ROOT = Path(__file__).parent.parent @@ -90,21 +85,11 @@ def check_for_changes(change_tracker: ChangedPathTracker) -> bool: return has_changes -def collect_template_dependencies(template_dir: Path) -> tuple[str | None, list[str]]: - try: +def pre_import_templates(template_dirs: list[Path]) -> None: + # Pre-import all template modules to avoid deadlocks. 
+    for template_dir in template_dirs:
         module_name = f"llama_stack.templates.{template_dir.name}"
-        module = importlib.import_module(module_name)
-
-        if template_func := getattr(module, "get_distribution_template", None):
-            template = template_func()
-            normal_deps, special_deps = get_provider_dependencies(template)
-            # Combine all dependencies in order: normal deps, special deps, server deps
-            all_deps = sorted(set(normal_deps + SERVER_DEPENDENCIES)) + sorted(set(special_deps))
-
-            return template.name, all_deps
-    except Exception:
-        return None, []
-    return None, []
+        importlib.import_module(module_name)
 
 
 def main():
@@ -118,6 +103,8 @@ def main():
         template_dirs = list(find_template_dirs(templates_dir))
         task = progress.add_task("Processing distribution templates...", total=len(template_dirs))
 
+        pre_import_templates(template_dirs)
+
         # Create a partial function with the progress bar
         process_func = partial(process_template, progress=progress, change_tracker=change_tracker)
diff --git a/tests/Containerfile b/tests/Containerfile
new file mode 100644
index 000000000..3080d053a
--- /dev/null
+++ b/tests/Containerfile
@@ -0,0 +1,13 @@
+# Containerfile used to build our all-in-one ollama image to run tests in CI
+# podman build --platform linux/amd64 -f Containerfile -t ollama-with-models .
+#
+FROM --platform=linux/amd64 ollama/ollama:latest
+
+# Start ollama and pull models in a single layer
+RUN ollama serve & \
+    sleep 5 && \
+    ollama pull llama3.2:3b-instruct-fp16 && \
+    ollama pull all-minilm:latest
+
+# Set the entrypoint to start ollama serve
+ENTRYPOINT ["ollama", "serve"]
diff --git a/tests/common/mcp.py b/tests/common/mcp.py
index fd7040c6c..775e38295 100644
--- a/tests/common/mcp.py
+++ b/tests/common/mcp.py
@@ -5,6 +5,7 @@
 # the root directory of this source tree.
 
 # we want the mcp server to be authenticated OR not, depends
+from collections.abc import Callable
 from contextlib import contextmanager
 
 # Unfortunately the toolgroup id must be tied to the tool names because the registry
@@ -13,15 +14,158 @@ from contextlib import contextmanager
 MCP_TOOLGROUP_ID = "mcp::localmcp"
 
 
+def default_tools():
+    """Default tools for backward compatibility."""
+    from mcp import types
+    from mcp.server.fastmcp import Context
+
+    async def greet_everyone(
+        url: str, ctx: Context
+    ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
+        return [types.TextContent(type="text", text="Hello, world!")]
+
+    async def get_boiling_point(liquid_name: str, celsius: bool = True) -> int:
+        """
+        Returns the boiling point of a liquid in Celsius or Fahrenheit.
+
+        :param liquid_name: The name of the liquid
+        :param celsius: Whether to return the boiling point in Celsius
+        :return: The boiling point of the liquid in Celsius or Fahrenheit
+        """
+        if liquid_name.lower() == "myawesomeliquid":
+            if celsius:
+                return -100
+            else:
+                return -212
+        else:
+            return -1
+
+    return {"greet_everyone": greet_everyone, "get_boiling_point": get_boiling_point}
+
+
+def dependency_tools():
+    """Tools with natural dependencies for multi-turn testing."""
+    from mcp import types
+    from mcp.server.fastmcp import Context
+
+    async def get_user_id(username: str, ctx: Context) -> str:
+        """
+        Get the user ID for a given username. This ID is needed for other operations.
+ + :param username: The username to look up + :return: The user ID for the username + """ + # Simple mapping for testing + user_mapping = {"alice": "user_12345", "bob": "user_67890", "charlie": "user_11111", "admin": "user_00000"} + return user_mapping.get(username.lower(), "user_99999") + + async def get_user_permissions(user_id: str, ctx: Context) -> str: + """ + Get the permissions for a user ID. Requires a valid user ID from get_user_id. + + :param user_id: The user ID to check permissions for + :return: The permissions for the user + """ + # Permission mapping based on user IDs + permission_mapping = { + "user_12345": "read,write", # alice + "user_67890": "read", # bob + "user_11111": "admin", # charlie + "user_00000": "superadmin", # admin + "user_99999": "none", # unknown users + } + return permission_mapping.get(user_id, "none") + + async def check_file_access(user_id: str, filename: str, ctx: Context) -> str: + """ + Check if a user can access a specific file. Requires a valid user ID. + + :param user_id: The user ID to check access for + :param filename: The filename to check access to + :return: Whether the user can access the file (yes/no) + """ + # Get permissions first + permission_mapping = { + "user_12345": "read,write", # alice + "user_67890": "read", # bob + "user_11111": "admin", # charlie + "user_00000": "superadmin", # admin + "user_99999": "none", # unknown users + } + permissions = permission_mapping.get(user_id, "none") + + # Check file access based on permissions and filename + if permissions == "superadmin": + access = "yes" + elif permissions == "admin": + access = "yes" if not filename.startswith("secret_") else "no" + elif "write" in permissions: + access = "yes" if filename.endswith(".txt") else "no" + elif "read" in permissions: + access = "yes" if filename.endswith(".txt") or filename.endswith(".md") else "no" + else: + access = "no" + + return [types.TextContent(type="text", text=access)] + + async def get_experiment_id(experiment_name: str, ctx: Context) -> str: + """ + Get the experiment ID for a given experiment name. This ID is needed to get results. + + :param experiment_name: The name of the experiment + :return: The experiment ID + """ + # Simple mapping for testing + experiment_mapping = { + "temperature_test": "exp_001", + "pressure_test": "exp_002", + "chemical_reaction": "exp_003", + "boiling_point": "exp_004", + } + exp_id = experiment_mapping.get(experiment_name.lower(), "exp_999") + return exp_id + + async def get_experiment_results(experiment_id: str, ctx: Context) -> str: + """ + Get the results for an experiment ID. Requires a valid experiment ID from get_experiment_id. 
+ + :param experiment_id: The experiment ID to get results for + :return: The experiment results + """ + # Results mapping based on experiment IDs + results_mapping = { + "exp_001": "Temperature: 25°C, Status: Success", + "exp_002": "Pressure: 1.2 atm, Status: Success", + "exp_003": "Yield: 85%, Status: Complete", + "exp_004": "Boiling Point: 100°C, Status: Verified", + "exp_999": "No results found", + } + results = results_mapping.get(experiment_id, "Invalid experiment ID") + return results + + return { + "get_user_id": get_user_id, + "get_user_permissions": get_user_permissions, + "check_file_access": check_file_access, + "get_experiment_id": get_experiment_id, + "get_experiment_results": get_experiment_results, + } + + @contextmanager -def make_mcp_server(required_auth_token: str | None = None): +def make_mcp_server(required_auth_token: str | None = None, tools: dict[str, Callable] | None = None): + """ + Create an MCP server with the specified tools. + + :param required_auth_token: Optional auth token required for access + :param tools: Dictionary of tool_name -> tool_function. If None, uses default tools. + """ import threading import time import httpx import uvicorn - from mcp import types - from mcp.server.fastmcp import Context, FastMCP + from mcp.server.fastmcp import FastMCP from mcp.server.sse import SseServerTransport from starlette.applications import Starlette from starlette.responses import Response @@ -29,35 +173,18 @@ def make_mcp_server(required_auth_token: str | None = None): server = FastMCP("FastMCP Test Server", log_level="WARNING") - @server.tool() - async def greet_everyone( - url: str, ctx: Context - ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]: - return [types.TextContent(type="text", text="Hello, world!")] + tools = tools or default_tools() - @server.tool() - async def get_boiling_point(liquid_name: str, celcius: bool = True) -> int: - """ - Returns the boiling point of a liquid in Celcius or Fahrenheit. - - :param liquid_name: The name of the liquid - :param celcius: Whether to return the boiling point in Celcius - :return: The boiling point of the liquid in Celcius or Fahrenheit - """ - if liquid_name.lower() == "polyjuice": - if celcius: - return -100 - else: - return -212 - else: - return -1 + # Register all tools with the server + for tool_func in tools.values(): + server.tool()(tool_func) sse = SseServerTransport("/messages/") async def handle_sse(request): from starlette.exceptions import HTTPException - auth_header = request.headers.get("Authorization") + auth_header: str | None = request.headers.get("Authorization") auth_token = None if auth_header and auth_header.startswith("Bearer "): auth_token = auth_header.split(" ")[1] diff --git a/tests/integration/files/__init__.py b/tests/integration/files/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/tests/integration/files/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/tests/integration/files/test_files.py b/tests/integration/files/test_files.py new file mode 100644 index 000000000..8375507dc --- /dev/null +++ b/tests/integration/files/test_files.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from io import BytesIO + +import pytest + +from llama_stack.distribution.library_client import LlamaStackAsLibraryClient + + +def test_openai_client_basic_operations(openai_client, client_with_models): + """Test basic file operations through OpenAI client.""" + if isinstance(client_with_models, LlamaStackAsLibraryClient): + pytest.skip("OpenAI files are not supported when testing with library client yet.") + client = openai_client + + test_content = b"files test content" + + try: + # Upload file using OpenAI client + with BytesIO(test_content) as file_buffer: + file_buffer.name = "openai_test.txt" + uploaded_file = client.files.create(file=file_buffer, purpose="assistants") + + # Verify basic response structure + assert uploaded_file.id.startswith("file-") + assert hasattr(uploaded_file, "filename") + + # List files + files_list = client.files.list() + file_ids = [f.id for f in files_list.data] + assert uploaded_file.id in file_ids + + # Retrieve file info + retrieved_file = client.files.retrieve(uploaded_file.id) + assert retrieved_file.id == uploaded_file.id + + # Retrieve file content - OpenAI client returns httpx Response object + content_response = client.files.content(uploaded_file.id) + # The response is an httpx Response object with .content attribute containing bytes + content = content_response.content + assert content == test_content + + # Delete file + delete_response = client.files.delete(uploaded_file.id) + assert delete_response.deleted is True + + except Exception as e: + # Cleanup in case of failure + try: + client.files.delete(uploaded_file.id) + except Exception: + pass + raise e diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 2cd76a23d..190840f70 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -268,9 +268,9 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode False, ], ) -def test_inference_store(openai_client, client_with_models, text_model_id, stream): +def test_inference_store(compat_client, client_with_models, text_model_id, stream): skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id) - client = openai_client + client = compat_client # make a chat completion message = "Hello, world!" 
response = client.chat.completions.create( @@ -301,9 +301,14 @@ def test_inference_store(openai_client, client_with_models, text_model_id, strea retrieved_response = client.chat.completions.retrieve(response_id) assert retrieved_response.id == response_id - assert retrieved_response.input_messages[0]["content"] == message, retrieved_response assert retrieved_response.choices[0].message.content == content, retrieved_response + input_content = ( + getattr(retrieved_response.input_messages[0], "content", None) + or retrieved_response.input_messages[0]["content"] + ) + assert input_content == message, retrieved_response + @pytest.mark.parametrize( "stream", @@ -312,9 +317,9 @@ def test_inference_store(openai_client, client_with_models, text_model_id, strea False, ], ) -def test_inference_store_tool_calls(openai_client, client_with_models, text_model_id, stream): +def test_inference_store_tool_calls(compat_client, client_with_models, text_model_id, stream): skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id) - client = openai_client + client = compat_client # make a chat completion message = "What's the weather in Tokyo? Use the get_weather function to get the weather." response = client.chat.completions.create( @@ -361,7 +366,11 @@ def test_inference_store_tool_calls(openai_client, client_with_models, text_mode retrieved_response = client.chat.completions.retrieve(response_id) assert retrieved_response.id == response_id - assert retrieved_response.input_messages[0]["content"] == message + input_content = ( + getattr(retrieved_response.input_messages[0], "content", None) + or retrieved_response.input_messages[0]["content"] + ) + assert input_content == message, retrieved_response tool_calls = retrieved_response.choices[0].message.tool_calls # sometimes model doesn't ouptut tool calls, but we still want to test that the tool was called if tool_calls: diff --git a/tests/integration/inference/test_openai_embeddings.py b/tests/integration/inference/test_openai_embeddings.py new file mode 100644 index 000000000..759556257 --- /dev/null +++ b/tests/integration/inference/test_openai_embeddings.py @@ -0,0 +1,275 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+
+import base64
+import struct
+
+import pytest
+from openai import OpenAI
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+
+def decode_base64_to_floats(base64_string: str) -> list[float]:
+    """Helper function to decode base64 string to list of float32 values."""
+    embedding_bytes = base64.b64decode(base64_string)
+    float_count = len(embedding_bytes) // 4  # 4 bytes per float32
+    embedding_floats = struct.unpack(f"{float_count}f", embedding_bytes)
+    return list(embedding_floats)
+
+
+def provider_from_model(client_with_models, model_id):
+    models = {m.identifier: m for m in client_with_models.models.list()}
+    models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
+    provider_id = models[model_id].provider_id
+    providers = {p.provider_id: p for p in client_with_models.providers.list()}
+    return providers[provider_id]
+
+
+def skip_if_model_doesnt_support_variable_dimensions(model_id):
+    if "text-embedding-3" not in model_id:
+        pytest.skip(f"{model_id} does not support variable output embedding dimensions")
+
+
+def skip_if_model_doesnt_support_openai_embeddings(client_with_models, model_id):
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI embeddings are not supported when testing with library client yet.")
+
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "inline::meta-reference",
+        "remote::bedrock",
+        "remote::cerebras",
+        "remote::databricks",
+        "remote::runpod",
+        "remote::sambanova",
+        "remote::tgi",
+        "remote::ollama",
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.")
+
+
+@pytest.fixture
+def openai_client(client_with_models):
+    base_url = f"{client_with_models.base_url}/v1/openai/v1"
+    return OpenAI(base_url=base_url, api_key="fake")
+
+
+def test_openai_embeddings_single_string(openai_client, client_with_models, embedding_model_id):
+    """Test OpenAI embeddings endpoint with a single string input."""
+    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+
+    input_text = "Hello, world!"
+ + response = openai_client.embeddings.create( + model=embedding_model_id, + input=input_text, + encoding_format="float", + ) + + assert response.object == "list" + assert response.model == embedding_model_id + assert len(response.data) == 1 + assert response.data[0].object == "embedding" + assert response.data[0].index == 0 + assert isinstance(response.data[0].embedding, list) + assert len(response.data[0].embedding) > 0 + assert all(isinstance(x, float) for x in response.data[0].embedding) + + +def test_openai_embeddings_multiple_strings(openai_client, client_with_models, embedding_model_id): + """Test OpenAI embeddings endpoint with multiple string inputs.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + + input_texts = ["Hello, world!", "How are you today?", "This is a test."] + + response = openai_client.embeddings.create( + model=embedding_model_id, + input=input_texts, + ) + + assert response.object == "list" + assert response.model == embedding_model_id + assert len(response.data) == len(input_texts) + + for i, embedding_data in enumerate(response.data): + assert embedding_data.object == "embedding" + assert embedding_data.index == i + assert isinstance(embedding_data.embedding, list) + assert len(embedding_data.embedding) > 0 + assert all(isinstance(x, float) for x in embedding_data.embedding) + + +def test_openai_embeddings_with_encoding_format_float(openai_client, client_with_models, embedding_model_id): + """Test OpenAI embeddings endpoint with float encoding format.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + + input_text = "Test encoding format" + + response = openai_client.embeddings.create( + model=embedding_model_id, + input=input_text, + encoding_format="float", + ) + + assert response.object == "list" + assert len(response.data) == 1 + assert isinstance(response.data[0].embedding, list) + assert all(isinstance(x, float) for x in response.data[0].embedding) + + +def test_openai_embeddings_with_dimensions(openai_client, client_with_models, embedding_model_id): + """Test OpenAI embeddings endpoint with custom dimensions parameter.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + skip_if_model_doesnt_support_variable_dimensions(embedding_model_id) + + input_text = "Test dimensions parameter" + dimensions = 16 + + response = openai_client.embeddings.create( + model=embedding_model_id, + input=input_text, + dimensions=dimensions, + ) + + assert response.object == "list" + assert len(response.data) == 1 + # Note: Not all models support custom dimensions, so we don't assert the exact dimension + assert isinstance(response.data[0].embedding, list) + assert len(response.data[0].embedding) > 0 + + +def test_openai_embeddings_with_user_parameter(openai_client, client_with_models, embedding_model_id): + """Test OpenAI embeddings endpoint with user parameter.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + + input_text = "Test user parameter" + user_id = "test-user-123" + + response = openai_client.embeddings.create( + model=embedding_model_id, + input=input_text, + user=user_id, + ) + + assert response.object == "list" + assert len(response.data) == 1 + assert isinstance(response.data[0].embedding, list) + assert len(response.data[0].embedding) > 0 + + +def test_openai_embeddings_empty_list_error(openai_client, client_with_models, embedding_model_id): + """Test that empty list input raises an 
appropriate error.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + + with pytest.raises(Exception): # noqa: B017 + openai_client.embeddings.create( + model=embedding_model_id, + input=[], + ) + + +def test_openai_embeddings_invalid_model_error(openai_client, client_with_models, embedding_model_id): + """Test that invalid model ID raises an appropriate error.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + + with pytest.raises(Exception): # noqa: B017 + openai_client.embeddings.create( + model="invalid-model-id", + input="Test text", + ) + + +def test_openai_embeddings_different_inputs_different_outputs(openai_client, client_with_models, embedding_model_id): + """Test that different inputs produce different embeddings.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + + input_text1 = "This is the first text" + input_text2 = "This is completely different content" + + response1 = openai_client.embeddings.create( + model=embedding_model_id, + input=input_text1, + ) + + response2 = openai_client.embeddings.create( + model=embedding_model_id, + input=input_text2, + ) + + embedding1 = response1.data[0].embedding + embedding2 = response2.data[0].embedding + + assert len(embedding1) == len(embedding2) + # Embeddings should be different for different inputs + assert embedding1 != embedding2 + + +def test_openai_embeddings_with_encoding_format_base64(openai_client, client_with_models, embedding_model_id): + """Test OpenAI embeddings endpoint with base64 encoding format.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + skip_if_model_doesnt_support_variable_dimensions(embedding_model_id) + + input_text = "Test base64 encoding format" + dimensions = 12 + + response = openai_client.embeddings.create( + model=embedding_model_id, + input=input_text, + encoding_format="base64", + dimensions=dimensions, + ) + + # Validate response structure + assert response.object == "list" + assert len(response.data) == 1 + + # With base64 encoding, embedding should be a string, not a list + embedding_data = response.data[0] + assert embedding_data.object == "embedding" + assert embedding_data.index == 0 + assert isinstance(embedding_data.embedding, str) + + # Verify it's valid base64 and decode to floats + embedding_floats = decode_base64_to_floats(embedding_data.embedding) + + # Verify we got valid floats + assert len(embedding_floats) == dimensions, f"Got embedding length {len(embedding_floats)}, expected {dimensions}" + assert all(isinstance(x, float) for x in embedding_floats) + + +def test_openai_embeddings_base64_batch_processing(openai_client, client_with_models, embedding_model_id): + """Test OpenAI embeddings endpoint with base64 encoding for batch processing.""" + skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id) + + input_texts = ["First text for base64", "Second text for base64", "Third text for base64"] + + response = openai_client.embeddings.create( + model=embedding_model_id, + input=input_texts, + encoding_format="base64", + ) + + # Validate response structure + assert response.object == "list" + assert response.model == embedding_model_id + assert len(response.data) == len(input_texts) + + # Validate each embedding in the batch + embedding_dimensions = [] + for i, embedding_data in enumerate(response.data): + assert embedding_data.object == "embedding" + assert embedding_data.index == i + + 
# With base64 encoding, embedding should be a string, not a list + assert isinstance(embedding_data.embedding, str) + embedding_floats = decode_base64_to_floats(embedding_data.embedding) + assert len(embedding_floats) > 0 + assert all(isinstance(x, float) for x in embedding_floats) + embedding_dimensions.append(len(embedding_floats)) + + # All embeddings should have the same dimensionality + assert all(dim == embedding_dimensions[0] for dim in embedding_dimensions) diff --git a/tests/integration/vector_io/test_vector_io.py b/tests/integration/vector_io/test_vector_io.py index 90cb00313..f1cac9701 100644 --- a/tests/integration/vector_io/test_vector_io.py +++ b/tests/integration/vector_io/test_vector_io.py @@ -120,3 +120,37 @@ def test_insert_chunks(client_with_empty_registry, embedding_model_id, sample_ch top_match = response.chunks[0] assert top_match is not None assert top_match.metadata["document_id"] == expected_doc_id, f"Query '{query}' should match {expected_doc_id}" + + +def test_insert_chunks_with_precomputed_embeddings(client_with_empty_registry, embedding_model_id): + vector_db_id = "test_precomputed_embeddings_db" + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=384, + ) + + chunks_with_embeddings = [ + Chunk( + content="This is a test chunk with precomputed embedding.", + metadata={"document_id": "doc1", "source": "precomputed"}, + embedding=[0.1] * 384, + ), + ] + + client_with_empty_registry.vector_io.insert( + vector_db_id=vector_db_id, + chunks=chunks_with_embeddings, + ) + + # Query for the first document + response = client_with_empty_registry.vector_io.query( + vector_db_id=vector_db_id, + query="precomputed embedding test", + ) + + # Verify the top result is the expected document + assert response is not None + assert len(response.chunks) > 0 + assert response.chunks[0].metadata["document_id"] == "doc1" + assert response.chunks[0].metadata["source"] == "precomputed" diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py index 2a30fd0b8..9cbdc8e51 100644 --- a/tests/unit/distribution/routers/test_routing_tables.py +++ b/tests/unit/distribution/routers/test_routing_tables.py @@ -121,7 +121,7 @@ class ToolGroupsImpl(Impl): @pytest.mark.asyncio async def test_models_routing_table(cached_disk_dist_registry): - table = ModelsRoutingTable({"test_provider": InferenceImpl()}, cached_disk_dist_registry) + table = ModelsRoutingTable({"test_provider": InferenceImpl()}, cached_disk_dist_registry, {}) await table.initialize() # Register multiple models and verify listing @@ -163,7 +163,7 @@ async def test_models_routing_table(cached_disk_dist_registry): @pytest.mark.asyncio async def test_shields_routing_table(cached_disk_dist_registry): - table = ShieldsRoutingTable({"test_provider": SafetyImpl()}, cached_disk_dist_registry) + table = ShieldsRoutingTable({"test_provider": SafetyImpl()}, cached_disk_dist_registry, {}) await table.initialize() # Register multiple shields and verify listing @@ -179,14 +179,14 @@ async def test_shields_routing_table(cached_disk_dist_registry): @pytest.mark.asyncio async def test_vectordbs_routing_table(cached_disk_dist_registry): - table = VectorDBsRoutingTable({"test_provider": VectorDBImpl()}, cached_disk_dist_registry) + table = VectorDBsRoutingTable({"test_provider": VectorDBImpl()}, cached_disk_dist_registry, {}) await table.initialize() - m_table = ModelsRoutingTable({"test_providere": 
InferenceImpl()}, cached_disk_dist_registry) + m_table = ModelsRoutingTable({"test_provider": InferenceImpl()}, cached_disk_dist_registry, {}) await m_table.initialize() await m_table.register_model( model_id="test-model", - provider_id="test_providere", + provider_id="test_provider", metadata={"embedding_dimension": 128}, model_type=ModelType.embedding, ) @@ -209,7 +209,7 @@ async def test_vectordbs_routing_table(cached_disk_dist_registry): async def test_datasets_routing_table(cached_disk_dist_registry): - table = DatasetsRoutingTable({"localfs": DatasetsImpl()}, cached_disk_dist_registry) + table = DatasetsRoutingTable({"localfs": DatasetsImpl()}, cached_disk_dist_registry, {}) await table.initialize() # Register multiple datasets and verify listing @@ -235,7 +235,7 @@ async def test_datasets_routing_table(cached_disk_dist_registry): @pytest.mark.asyncio async def test_scoring_functions_routing_table(cached_disk_dist_registry): - table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry) + table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry, {}) await table.initialize() # Register multiple scoring functions and verify listing @@ -261,7 +261,7 @@ async def test_scoring_functions_routing_table(cached_disk_dist_registry): @pytest.mark.asyncio async def test_benchmarks_routing_table(cached_disk_dist_registry): - table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry) + table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {}) await table.initialize() # Register multiple benchmarks and verify listing @@ -279,7 +279,7 @@ async def test_benchmarks_routing_table(cached_disk_dist_registry): @pytest.mark.asyncio async def test_tool_groups_routing_table(cached_disk_dist_registry): - table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, cached_disk_dist_registry) + table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, cached_disk_dist_registry, {}) await table.initialize() # Register multiple tool groups and verify listing diff --git a/tests/unit/files/__init__.py b/tests/unit/files/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/tests/unit/files/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/tests/unit/files/test_files.py b/tests/unit/files/test_files.py new file mode 100644 index 000000000..32006cbff --- /dev/null +++ b/tests/unit/files/test_files.py @@ -0,0 +1,334 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ + +import pytest +import pytest_asyncio + +from llama_stack.apis.common.responses import Order +from llama_stack.apis.files import OpenAIFilePurpose +from llama_stack.providers.inline.files.localfs import ( + LocalfsFilesImpl, + LocalfsFilesImplConfig, +) +from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig + + +class MockUploadFile: + """Mock UploadFile for testing file uploads.""" + + def __init__(self, content: bytes, filename: str, content_type: str = "text/plain"): + self.content = content + self.filename = filename + self.content_type = content_type + + async def read(self): + return self.content + + +@pytest_asyncio.fixture +async def files_provider(tmp_path): + """Create a files provider with temporary storage for testing.""" + storage_dir = tmp_path / "files" + db_path = tmp_path / "files_metadata.db" + + config = LocalfsFilesImplConfig( + storage_dir=storage_dir.as_posix(), metadata_store=SqliteSqlStoreConfig(db_path=db_path.as_posix()) + ) + + provider = LocalfsFilesImpl(config) + await provider.initialize() + yield provider + + +@pytest.fixture +def sample_text_file(): + """Sample text file for testing.""" + content = b"Hello, this is a test file for the OpenAI Files API!" + return MockUploadFile(content, "test.txt", "text/plain") + + +@pytest.fixture +def sample_json_file(): + """Sample JSON file for testing.""" + content = b'{"message": "Hello, World!", "type": "test"}' + return MockUploadFile(content, "data.json", "application/json") + + +@pytest.fixture +def large_file(): + """Large file for testing file size handling.""" + content = b"x" * 1024 * 1024 # 1MB file + return MockUploadFile(content, "large_file.bin", "application/octet-stream") + + +class TestOpenAIFilesAPI: + """Test suite for OpenAI Files API endpoints.""" + + @pytest.mark.asyncio + async def test_upload_file_success(self, files_provider, sample_text_file): + """Test successful file upload.""" + # Upload file + result = await files_provider.openai_upload_file(file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS) + + # Verify response + assert result.id.startswith("file-") + assert result.filename == "test.txt" + assert result.purpose == OpenAIFilePurpose.ASSISTANTS + assert result.bytes == len(sample_text_file.content) + assert result.created_at > 0 + assert result.expires_at > result.created_at + + @pytest.mark.asyncio + async def test_upload_different_purposes(self, files_provider, sample_text_file): + """Test uploading files with different purposes.""" + purposes = list(OpenAIFilePurpose) + + uploaded_files = [] + for purpose in purposes: + result = await files_provider.openai_upload_file(file=sample_text_file, purpose=purpose) + uploaded_files.append(result) + assert result.purpose == purpose + + @pytest.mark.asyncio + async def test_upload_different_file_types(self, files_provider, sample_text_file, sample_json_file, large_file): + """Test uploading different types and sizes of files.""" + files_to_test = [ + (sample_text_file, "test.txt"), + (sample_json_file, "data.json"), + (large_file, "large_file.bin"), + ] + + for file_obj, expected_filename in files_to_test: + result = await files_provider.openai_upload_file(file=file_obj, purpose=OpenAIFilePurpose.ASSISTANTS) + assert result.filename == expected_filename + assert result.bytes == len(file_obj.content) + + @pytest.mark.asyncio + async def test_list_files_empty(self, files_provider): + """Test listing files when no files exist.""" + result = await files_provider.openai_list_files() + + assert result.data == [] + 
assert result.has_more is False + assert result.first_id == "" + assert result.last_id == "" + + @pytest.mark.asyncio + async def test_list_files_with_content(self, files_provider, sample_text_file, sample_json_file): + """Test listing files when files exist.""" + # Upload multiple files + file1 = await files_provider.openai_upload_file(file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS) + file2 = await files_provider.openai_upload_file(file=sample_json_file, purpose=OpenAIFilePurpose.ASSISTANTS) + + # List files + result = await files_provider.openai_list_files() + + assert len(result.data) == 2 + file_ids = [f.id for f in result.data] + assert file1.id in file_ids + assert file2.id in file_ids + + @pytest.mark.asyncio + async def test_list_files_with_purpose_filter(self, files_provider, sample_text_file): + """Test listing files with purpose filtering.""" + # Upload file with specific purpose + uploaded_file = await files_provider.openai_upload_file( + file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS + ) + + # List files with matching purpose + result = await files_provider.openai_list_files(purpose=OpenAIFilePurpose.ASSISTANTS) + assert len(result.data) == 1 + assert result.data[0].id == uploaded_file.id + assert result.data[0].purpose == OpenAIFilePurpose.ASSISTANTS + + @pytest.mark.asyncio + async def test_list_files_with_limit(self, files_provider, sample_text_file): + """Test listing files with limit parameter.""" + # Upload multiple files + for _ in range(5): + await files_provider.openai_upload_file(file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS) + + # List with limit + result = await files_provider.openai_list_files(limit=3) + assert len(result.data) == 3 + + @pytest.mark.asyncio + async def test_list_files_with_order(self, files_provider, sample_text_file): + """Test listing files with different order.""" + # Upload multiple files + files = [] + for _ in range(3): + file = await files_provider.openai_upload_file(file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS) + files.append(file) + + # Test descending order (default) + result_desc = await files_provider.openai_list_files(order=Order.desc) + assert len(result_desc.data) == 3 + # Most recent should be first + assert result_desc.data[0].created_at >= result_desc.data[1].created_at >= result_desc.data[2].created_at + + # Test ascending order + result_asc = await files_provider.openai_list_files(order=Order.asc) + assert len(result_asc.data) == 3 + # Oldest should be first + assert result_asc.data[0].created_at <= result_asc.data[1].created_at <= result_asc.data[2].created_at + + @pytest.mark.asyncio + async def test_retrieve_file_success(self, files_provider, sample_text_file): + """Test successful file retrieval.""" + # Upload file + uploaded_file = await files_provider.openai_upload_file( + file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS + ) + + # Retrieve file + retrieved_file = await files_provider.openai_retrieve_file(uploaded_file.id) + + # Verify response + assert retrieved_file.id == uploaded_file.id + assert retrieved_file.filename == uploaded_file.filename + assert retrieved_file.purpose == uploaded_file.purpose + assert retrieved_file.bytes == uploaded_file.bytes + assert retrieved_file.created_at == uploaded_file.created_at + assert retrieved_file.expires_at == uploaded_file.expires_at + + @pytest.mark.asyncio + async def test_retrieve_file_not_found(self, files_provider): + """Test retrieving a non-existent file.""" + with pytest.raises(ValueError, match="File 
with id file-nonexistent not found"): + await files_provider.openai_retrieve_file("file-nonexistent") + + @pytest.mark.asyncio + async def test_retrieve_file_content_success(self, files_provider, sample_text_file): + """Test successful file content retrieval.""" + # Upload file + uploaded_file = await files_provider.openai_upload_file( + file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS + ) + + # Retrieve file content + content = await files_provider.openai_retrieve_file_content(uploaded_file.id) + + # Verify content + assert content.body == sample_text_file.content + + @pytest.mark.asyncio + async def test_retrieve_file_content_not_found(self, files_provider): + """Test retrieving content of a non-existent file.""" + with pytest.raises(ValueError, match="File with id file-nonexistent not found"): + await files_provider.openai_retrieve_file_content("file-nonexistent") + + @pytest.mark.asyncio + async def test_delete_file_success(self, files_provider, sample_text_file): + """Test successful file deletion.""" + # Upload file + uploaded_file = await files_provider.openai_upload_file( + file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS + ) + + # Verify file exists + await files_provider.openai_retrieve_file(uploaded_file.id) + + # Delete file + delete_response = await files_provider.openai_delete_file(uploaded_file.id) + + # Verify delete response + assert delete_response.id == uploaded_file.id + assert delete_response.deleted is True + + # Verify file no longer exists + with pytest.raises(ValueError, match=f"File with id {uploaded_file.id} not found"): + await files_provider.openai_retrieve_file(uploaded_file.id) + + @pytest.mark.asyncio + async def test_delete_file_not_found(self, files_provider): + """Test deleting a non-existent file.""" + with pytest.raises(ValueError, match="File with id file-nonexistent not found"): + await files_provider.openai_delete_file("file-nonexistent") + + @pytest.mark.asyncio + async def test_file_persistence_across_operations(self, files_provider, sample_text_file): + """Test that files persist correctly across multiple operations.""" + # Upload file + uploaded_file = await files_provider.openai_upload_file( + file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS + ) + + # Verify it appears in listing + files_list = await files_provider.openai_list_files() + assert len(files_list.data) == 1 + assert files_list.data[0].id == uploaded_file.id + + # Retrieve file info + retrieved_file = await files_provider.openai_retrieve_file(uploaded_file.id) + assert retrieved_file.id == uploaded_file.id + + # Retrieve file content + content = await files_provider.openai_retrieve_file_content(uploaded_file.id) + assert content.body == sample_text_file.content + + # Delete file + await files_provider.openai_delete_file(uploaded_file.id) + + # Verify it's gone from listing + files_list = await files_provider.openai_list_files() + assert len(files_list.data) == 0 + + @pytest.mark.asyncio + async def test_multiple_files_operations(self, files_provider, sample_text_file, sample_json_file): + """Test operations with multiple files.""" + # Upload multiple files + file1 = await files_provider.openai_upload_file(file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS) + file2 = await files_provider.openai_upload_file(file=sample_json_file, purpose=OpenAIFilePurpose.ASSISTANTS) + + # Verify both exist + files_list = await files_provider.openai_list_files() + assert len(files_list.data) == 2 + + # Delete one file + await 
files_provider.openai_delete_file(file1.id) + + # Verify only one remains + files_list = await files_provider.openai_list_files() + assert len(files_list.data) == 1 + assert files_list.data[0].id == file2.id + + # Verify the remaining file is still accessible + content = await files_provider.openai_retrieve_file_content(file2.id) + assert content.body == sample_json_file.content + + @pytest.mark.asyncio + async def test_file_id_uniqueness(self, files_provider, sample_text_file): + """Test that each uploaded file gets a unique ID.""" + file_ids = set() + + # Upload same file multiple times + for _ in range(10): + uploaded_file = await files_provider.openai_upload_file( + file=sample_text_file, purpose=OpenAIFilePurpose.ASSISTANTS + ) + assert uploaded_file.id not in file_ids, f"Duplicate file ID: {uploaded_file.id}" + file_ids.add(uploaded_file.id) + assert uploaded_file.id.startswith("file-") + + @pytest.mark.asyncio + async def test_file_no_filename_handling(self, files_provider): + """Test handling files with no filename.""" + file_without_name = MockUploadFile(b"content", None) # No filename + + uploaded_file = await files_provider.openai_upload_file( + file=file_without_name, purpose=OpenAIFilePurpose.ASSISTANTS + ) + + assert uploaded_file.filename == "uploaded_file" # Default filename + + @pytest.mark.asyncio + async def test_after_pagination_not_implemented(self, files_provider): + """Test that 'after' pagination raises NotImplementedError.""" + with pytest.raises(NotImplementedError, match="After pagination not yet implemented"): + await files_provider.openai_list_files(after="file-some-id") diff --git a/tests/unit/models/llama/test_tokenizer_utils.py b/tests/unit/models/llama/test_tokenizer_utils.py new file mode 100644 index 000000000..57fc346e1 --- /dev/null +++ b/tests/unit/models/llama/test_tokenizer_utils.py @@ -0,0 +1,177 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import base64 +import time +from pathlib import Path +from unittest.mock import patch + +import pytest +from tiktoken.load import load_tiktoken_bpe + +from llama_stack.models.llama.tokenizer_utils import load_bpe_file + + +@pytest.fixture +def test_bpe_content(): + """Sample BPE file content for testing.""" + return """wA== 0 +wQ== 1 +9Q== 2 +9g== 3 +9w== 4 ++A== 5 ++Q== 6 ++g== 7 ++w== 8 +/A== 9 +/Q== 10 +/g== 11 +/w== 12 +AA== 13 +AQ== 14""" + + +@pytest.fixture +def test_bpe_file(tmp_path, test_bpe_content): + """Create a temporary BPE file for testing.""" + bpe_file = tmp_path / "test_tokenizer.model" + bpe_file.write_text(test_bpe_content, encoding="utf-8") + return bpe_file + + +@pytest.fixture +def llama3_model_path(): + """Path to Llama3 tokenizer model.""" + return Path(__file__).parent / "../../../../llama_stack/models/llama/llama3/tokenizer.model" + + +@pytest.fixture +def llama4_model_path(): + """Path to Llama4 tokenizer model.""" + return Path(__file__).parent / "../../../../llama_stack/models/llama/llama4/tokenizer.model" + + +def test_load_bpe_file_basic_functionality(test_bpe_file): + """Test that load_bpe_file correctly parses BPE files.""" + result = load_bpe_file(test_bpe_file) + + for key, value in result.items(): + assert isinstance(key, bytes) + assert isinstance(value, int) + + assert len(result) == 15 + + expected_first_token = base64.b64decode("wA==") + assert expected_first_token in result + assert result[expected_first_token] == 0 + + +def test_load_bpe_file_vs_tiktoken_with_real_model(llama3_model_path): + """Test that our implementation produces identical results to tiktoken on real model files.""" + if not llama3_model_path.exists(): + pytest.skip("Llama3 tokenizer model not found") + + our_result = load_bpe_file(llama3_model_path) + tiktoken_result = load_tiktoken_bpe(llama3_model_path.as_posix()) + + # Compare results from our implementation and tiktoken + assert len(our_result) == len(tiktoken_result) + assert our_result == tiktoken_result + + assert len(our_result) > 100000 + ranks = list(our_result.values()) + assert len(ranks) == len(set(ranks)) + + +def test_load_bpe_file_vs_tiktoken_with_llama4_model(llama4_model_path): + """Test that our implementation produces identical results to tiktoken on Llama4 model.""" + if not llama4_model_path.exists(): + pytest.skip("Llama4 tokenizer model not found") + + our_result = load_bpe_file(llama4_model_path) + tiktoken_result = load_tiktoken_bpe(llama4_model_path.as_posix()) + + # Compare results from our implementation and tiktoken + assert len(our_result) == len(tiktoken_result) + assert our_result == tiktoken_result + + assert len(our_result) > 100000 + ranks = list(our_result.values()) + assert len(ranks) == len(set(ranks)) + + +def test_load_bpe_file_malformed_lines(tmp_path): + """Test that load_bpe_file handles malformed lines gracefully.""" + malformed_content = """wA== 0 +invalid_line_without_rank +wQ== 1 +invalid_base64!!! 
2 +9Q== 2""" + + test_file = tmp_path / "malformed.model" + test_file.write_text(malformed_content, encoding="utf-8") + + with patch("llama_stack.models.llama.tokenizer_utils.logger") as mock_logger: + result = load_bpe_file(test_file) + + # Should have 3 valid entries (skipping malformed ones) + assert len(result) == 3 + + # Should have logged warnings for malformed lines + assert mock_logger.warning.called + assert mock_logger.warning.call_count > 0 + + +def test_load_bpe_file_nonexistent_file(): + """Test that load_bpe_file raises appropriate error for nonexistent files.""" + with pytest.raises(FileNotFoundError): + load_bpe_file("/nonexistent/path/to/file.model") + + +def test_tokenizer_integration(): + """Test that our load_bpe_file works correctly when used in actual tokenizers.""" + try: + from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer + + tokenizer = Llama3Tokenizer.get_instance() + + # Test basic functionality + test_text = "Hello, world! This is a test." + tokens = tokenizer.encode(test_text, bos=False, eos=False) + decoded = tokenizer.decode(tokens) + + assert test_text == decoded + assert isinstance(tokens, list) + assert all(isinstance(token, int) for token in tokens) + + except Exception as e: + pytest.skip(f"Llama3 tokenizer not available: {e}") + + +def test_performance_comparison(llama3_model_path): + """Test that our implementation has reasonable performance compared to tiktoken.""" + if not llama3_model_path.exists(): + pytest.skip("Llama3 tokenizer model not found") + + # Time our implementation + start_time = time.time() + our_result = load_bpe_file(llama3_model_path) + our_time = time.time() - start_time + + # Time tiktoken implementation + start_time = time.time() + tiktoken_result = load_tiktoken_bpe(llama3_model_path.as_posix()) + tiktoken_time = time.time() - start_time + + # Verify results are identical + assert our_result == tiktoken_result + + # Our implementation should be reasonably fast (within 10x of tiktoken) + # This is a loose bound since we're optimizing for correctness, not speed + assert our_time < tiktoken_time * 10, f"Our implementation took {our_time:.3f}s vs tiktoken's {tiktoken_time:.3f}s" + + print(f"Performance comparison - Our: {our_time:.3f}s, Tiktoken: {tiktoken_time:.3f}s") diff --git a/tests/unit/providers/agent/test_meta_reference_agent.py b/tests/unit/providers/agent/test_meta_reference_agent.py index 9549f6df6..7a7d52892 100644 --- a/tests/unit/providers/agent/test_meta_reference_agent.py +++ b/tests/unit/providers/agent/test_meta_reference_agent.py @@ -59,6 +59,7 @@ async def agents_impl(config, mock_apis): mock_apis["safety_api"], mock_apis["tool_runtime_api"], mock_apis["tool_groups_api"], + {}, ) await impl.initialize() yield impl diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py index 5b6cee0ec..34f22c39f 100644 --- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py +++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py @@ -25,11 +25,17 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseObjectWithInput, OpenAIResponseOutputMessageContentOutputText, OpenAIResponseOutputMessageWebSearchToolCall, + OpenAIResponseText, + OpenAIResponseTextFormat, ) from llama_stack.apis.inference.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletionContentPartTextParam, OpenAIDeveloperMessageParam, + OpenAIJSONSchema, + 
OpenAIResponseFormatJSONObject, + OpenAIResponseFormatJSONSchema, + OpenAIResponseFormatText, OpenAIUserMessageParam, ) from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime @@ -74,6 +80,37 @@ def openai_responses_impl(mock_inference_api, mock_tool_groups_api, mock_tool_ru ) +async def fake_stream(fixture: str = "simple_chat_completion.yaml"): + value = load_chat_completion_fixture(fixture) + yield ChatCompletionChunk( + id=value.id, + choices=[ + Choice( + index=0, + delta=ChoiceDelta( + content=c.message.content, + role=c.message.role, + tool_calls=[ + ChoiceDeltaToolCall( + index=0, + id=t.id, + function=ChoiceDeltaToolCallFunction( + name=t.function.name, + arguments=t.function.arguments, + ), + ) + for t in (c.message.tool_calls or []) + ], + ), + ) + for c in value.choices + ], + created=1, + model=value.model, + object="chat.completion.chunk", + ) + + @pytest.mark.asyncio async def test_create_openai_response_with_string_input(openai_responses_impl, mock_inference_api): """Test creating an OpenAI response with a simple string input.""" @@ -82,8 +119,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m model = "meta-llama/Llama-3.1-8B-Instruct" # Load the chat completion fixture - mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml") - mock_inference_api.openai_chat_completion.return_value = mock_chat_completion + mock_inference_api.openai_chat_completion.return_value = fake_stream() # Execute result = await openai_responses_impl.create_openai_response( @@ -96,8 +132,9 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m mock_inference_api.openai_chat_completion.assert_called_once_with( model=model, messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)], + response_format=OpenAIResponseFormatText(), tools=None, - stream=False, + stream=True, temperature=0.1, ) openai_responses_impl.responses_store.store_response_object.assert_called_once() @@ -114,20 +151,15 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon input_text = "What is the capital of Ireland?" model = "meta-llama/Llama-3.1-8B-Instruct" - # Load the chat completion fixtures - tool_call_completion = load_chat_completion_fixture("tool_call_completion.yaml") - tool_response_completion = load_chat_completion_fixture("simple_chat_completion.yaml") - mock_inference_api.openai_chat_completion.side_effect = [ - tool_call_completion, - tool_response_completion, + fake_stream("tool_call_completion.yaml"), + fake_stream(), ] openai_responses_impl.tool_groups_api.get_tool.return_value = Tool( identifier="web_search", provider_id="client", toolgroup_id="web_search", - tool_host="client", description="Search the web for information", parameters=[ ToolParameter(name="query", parameter_type="string", description="The query to search for", required=True) @@ -182,7 +214,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_ input_text = "How hot it is in San Francisco today?" 
model = "meta-llama/Llama-3.1-8B-Instruct" - async def fake_stream(): + async def fake_stream_toolcall(): yield ChatCompletionChunk( id="123", choices=[ @@ -205,7 +237,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_ object="chat.completion.chunk", ) - mock_inference_api.openai_chat_completion.return_value = fake_stream() + mock_inference_api.openai_chat_completion.return_value = fake_stream_toolcall() # Execute result = await openai_responses_impl.create_openai_response( @@ -224,16 +256,16 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_ ], ) - # Verify + # Check that we got the content from our mocked tool execution result + chunks = [chunk async for chunk in result] + assert len(chunks) == 2 # Should have response.created and response.completed + + # Verify inference API was called correctly (after iterating over result) first_call = mock_inference_api.openai_chat_completion.call_args_list[0] assert first_call.kwargs["messages"][0].content == input_text assert first_call.kwargs["tools"] is not None assert first_call.kwargs["temperature"] == 0.1 - # Check that we got the content from our mocked tool execution result - chunks = [chunk async for chunk in result] - assert len(chunks) == 2 # Should have response.created and response.completed - # Check response.created event (should have empty output) assert chunks[0].type == "response.created" assert len(chunks[0].response.output) == 0 @@ -264,7 +296,7 @@ async def test_create_openai_response_with_multiple_messages(openai_responses_im ] model = "meta-llama/Llama-3.1-8B-Instruct" - mock_inference_api.openai_chat_completion.return_value = load_chat_completion_fixture("simple_chat_completion.yaml") + mock_inference_api.openai_chat_completion.return_value = fake_stream() # Execute await openai_responses_impl.create_openai_response( @@ -320,6 +352,7 @@ async def test_prepend_previous_response_basic(openai_responses_impl, mock_respo model="fake_model", output=[response_output_message], status="completed", + text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), input=[input_item_message], ) mock_responses_store.get_response_object.return_value = previous_response @@ -362,6 +395,7 @@ async def test_prepend_previous_response_web_search(openai_responses_impl, mock_ model="fake_model", output=[output_web_search, output_message], status="completed", + text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), input=[input_item_message], ) mock_responses_store.get_response_object.return_value = response @@ -390,9 +424,7 @@ async def test_create_openai_response_with_instructions(openai_responses_impl, m model = "meta-llama/Llama-3.1-8B-Instruct" instructions = "You are a geography expert. Provide concise answers." - # Load the chat completion fixture - mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml") - mock_inference_api.openai_chat_completion.return_value = mock_chat_completion + mock_inference_api.openai_chat_completion.return_value = fake_stream() # Execute await openai_responses_impl.create_openai_response( @@ -431,8 +463,7 @@ async def test_create_openai_response_with_instructions_and_multiple_messages( model = "meta-llama/Llama-3.1-8B-Instruct" instructions = "You are a geography expert. Provide concise answers." 
- mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml") - mock_inference_api.openai_chat_completion.return_value = mock_chat_completion + mock_inference_api.openai_chat_completion.return_value = fake_stream() # Execute await openai_responses_impl.create_openai_response( @@ -483,14 +514,15 @@ async def test_create_openai_response_with_instructions_and_previous_response( model="fake_model", output=[response_output_message], status="completed", + text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), input=[input_item_message], ) mock_responses_store.get_response_object.return_value = response model = "meta-llama/Llama-3.1-8B-Instruct" instructions = "You are a geography expert. Provide concise answers." - mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml") - mock_inference_api.openai_chat_completion.return_value = mock_chat_completion + + mock_inference_api.openai_chat_completion.return_value = fake_stream() # Execute await openai_responses_impl.create_openai_response( @@ -576,6 +608,7 @@ async def test_responses_store_list_input_items_logic(): object="response", status="completed", output=[], + text=OpenAIResponseText(format=(OpenAIResponseTextFormat(type="text"))), input=input_items, ) @@ -644,6 +677,7 @@ async def test_store_response_uses_rehydrated_input_with_previous_response( created_at=1234567890, model="meta-llama/Llama-3.1-8B-Instruct", status="completed", + text=OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), input=[ OpenAIResponseMessage( id="msg-prev-user", role="user", content=[OpenAIResponseInputMessageContentText(text="What is 2+2?")] @@ -662,8 +696,8 @@ async def test_store_response_uses_rehydrated_input_with_previous_response( current_input = "Now what is 3+3?" model = "meta-llama/Llama-3.1-8B-Instruct" - mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml") - mock_inference_api.openai_chat_completion.return_value = mock_chat_completion + + mock_inference_api.openai_chat_completion.return_value = fake_stream() # Execute - Create response with previous_response_id result = await openai_responses_impl.create_openai_response( @@ -694,3 +728,59 @@ async def test_store_response_uses_rehydrated_input_with_previous_response( # Verify the response itself is correct assert result.model == model assert result.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "text_format, response_format", + [ + (OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")), OpenAIResponseFormatText()), + ( + OpenAIResponseText(format=OpenAIResponseTextFormat(name="Test", schema={"foo": "bar"}, type="json_schema")), + OpenAIResponseFormatJSONSchema(json_schema=OpenAIJSONSchema(name="Test", schema={"foo": "bar"})), + ), + (OpenAIResponseText(format=OpenAIResponseTextFormat(type="json_object")), OpenAIResponseFormatJSONObject()), + # ensure text param with no format specified defaults to text + (OpenAIResponseText(format=None), OpenAIResponseFormatText()), + # ensure text param of None defaults to text + (None, OpenAIResponseFormatText()), + ], +) +async def test_create_openai_response_with_text_format( + openai_responses_impl, mock_inference_api, text_format, response_format +): + """Test creating Responses with text formats.""" + # Setup + input_text = "How hot it is in San Francisco today?" 
+ model = "meta-llama/Llama-3.1-8B-Instruct" + + mock_inference_api.openai_chat_completion.return_value = fake_stream() + + # Execute + _result = await openai_responses_impl.create_openai_response( + input=input_text, + model=model, + text=text_format, + ) + + # Verify + first_call = mock_inference_api.openai_chat_completion.call_args_list[0] + assert first_call.kwargs["messages"][0].content == input_text + assert first_call.kwargs["response_format"] is not None + assert first_call.kwargs["response_format"] == response_format + + +@pytest.mark.asyncio +async def test_create_openai_response_with_invalid_text_format(openai_responses_impl, mock_inference_api): + """Test creating an OpenAI response with an invalid text format.""" + # Setup + input_text = "How hot it is in San Francisco today?" + model = "meta-llama/Llama-3.1-8B-Instruct" + + # Execute + with pytest.raises(ValueError): + _result = await openai_responses_impl.create_openai_response( + input=input_text, + model=model, + text=OpenAIResponseText(format={"type": "invalid"}), + ) diff --git a/tests/unit/providers/agents/test_persistence_access_control.py b/tests/unit/providers/agents/test_persistence_access_control.py index 48fa647a8..d5b876a09 100644 --- a/tests/unit/providers/agents/test_persistence_access_control.py +++ b/tests/unit/providers/agents/test_persistence_access_control.py @@ -12,24 +12,24 @@ import pytest from llama_stack.apis.agents import Turn from llama_stack.apis.inference import CompletionMessage, StopReason -from llama_stack.distribution.datatypes import AccessAttributes +from llama_stack.distribution.datatypes import User from llama_stack.providers.inline.agents.meta_reference.persistence import AgentPersistence, AgentSessionInfo @pytest.fixture async def test_setup(sqlite_kvstore): - agent_persistence = AgentPersistence(agent_id="test_agent", kvstore=sqlite_kvstore) + agent_persistence = AgentPersistence(agent_id="test_agent", kvstore=sqlite_kvstore, policy={}) yield agent_persistence @pytest.mark.asyncio -@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes") -async def test_session_creation_with_access_attributes(mock_get_auth_attributes, test_setup): +@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user") +async def test_session_creation_with_access_attributes(mock_get_authenticated_user, test_setup): agent_persistence = test_setup # Set creator's attributes for the session creator_attributes = {"roles": ["researcher"], "teams": ["ai-team"]} - mock_get_auth_attributes.return_value = creator_attributes + mock_get_authenticated_user.return_value = User("test_user", creator_attributes) # Create a session session_id = await agent_persistence.create_session("Test Session") @@ -37,14 +37,15 @@ async def test_session_creation_with_access_attributes(mock_get_auth_attributes, # Get the session and verify access attributes were set session_info = await agent_persistence.get_session_info(session_id) assert session_info is not None - assert session_info.access_attributes is not None - assert session_info.access_attributes.roles == ["researcher"] - assert session_info.access_attributes.teams == ["ai-team"] + assert session_info.owner is not None + assert session_info.owner.attributes is not None + assert session_info.owner.attributes["roles"] == ["researcher"] + assert session_info.owner.attributes["teams"] == ["ai-team"] @pytest.mark.asyncio -@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes") -async 
def test_session_access_control(mock_get_auth_attributes, test_setup): +@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user") +async def test_session_access_control(mock_get_authenticated_user, test_setup): agent_persistence = test_setup # Create a session with specific access attributes @@ -53,8 +54,9 @@ async def test_session_access_control(mock_get_auth_attributes, test_setup): session_id=session_id, session_name="Restricted Session", started_at=datetime.now(), - access_attributes=AccessAttributes(roles=["admin"], teams=["security-team"]), + owner=User("someone", {"roles": ["admin"], "teams": ["security-team"]}), turns=[], + identifier="Restricted Session", ) await agent_persistence.kvstore.set( @@ -63,20 +65,22 @@ async def test_session_access_control(mock_get_auth_attributes, test_setup): ) # User with matching attributes can access - mock_get_auth_attributes.return_value = {"roles": ["admin", "user"], "teams": ["security-team", "other-team"]} + mock_get_authenticated_user.return_value = User( + "testuser", {"roles": ["admin", "user"], "teams": ["security-team", "other-team"]} + ) retrieved_session = await agent_persistence.get_session_info(session_id) assert retrieved_session is not None assert retrieved_session.session_id == session_id # User without matching attributes cannot access - mock_get_auth_attributes.return_value = {"roles": ["user"], "teams": ["other-team"]} + mock_get_authenticated_user.return_value = User("testuser", {"roles": ["user"], "teams": ["other-team"]}) retrieved_session = await agent_persistence.get_session_info(session_id) assert retrieved_session is None @pytest.mark.asyncio -@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes") -async def test_turn_access_control(mock_get_auth_attributes, test_setup): +@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user") +async def test_turn_access_control(mock_get_authenticated_user, test_setup): agent_persistence = test_setup # Create a session with restricted access @@ -85,8 +89,9 @@ async def test_turn_access_control(mock_get_auth_attributes, test_setup): session_id=session_id, session_name="Restricted Session", started_at=datetime.now(), - access_attributes=AccessAttributes(roles=["admin"]), + owner=User("someone", {"roles": ["admin"]}), turns=[], + identifier="Restricted Session", ) await agent_persistence.kvstore.set( @@ -109,7 +114,7 @@ async def test_turn_access_control(mock_get_auth_attributes, test_setup): ) # Admin can add turn - mock_get_auth_attributes.return_value = {"roles": ["admin"]} + mock_get_authenticated_user.return_value = User("testuser", {"roles": ["admin"]}) await agent_persistence.add_turn_to_session(session_id, turn) # Admin can get turn @@ -118,7 +123,7 @@ async def test_turn_access_control(mock_get_auth_attributes, test_setup): assert retrieved_turn.turn_id == turn_id # Regular user cannot get turn - mock_get_auth_attributes.return_value = {"roles": ["user"]} + mock_get_authenticated_user.return_value = User("testuser", {"roles": ["user"]}) with pytest.raises(ValueError): await agent_persistence.get_session_turn(session_id, turn_id) @@ -128,8 +133,8 @@ async def test_turn_access_control(mock_get_auth_attributes, test_setup): @pytest.mark.asyncio -@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_auth_attributes") -async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes, test_setup): 
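# The ownership checks these persistence tests assert reduce to a simple rule
# (illustrative sketch only, not the llama_stack implementation): a caller may see a
# resource if, for every attribute key the owner has, the caller shares at least one
# value; resources without owner attributes are visible to everyone.
def _matches_owner(
    owner_attrs: dict[str, list[str]] | None,
    user_attrs: dict[str, list[str]] | None,
) -> bool:
    if not owner_attrs:
        return True
    user_attrs = user_attrs or {}
    return all(set(values) & set(user_attrs.get(key, [])) for key, values in owner_attrs.items())


assert _matches_owner({"roles": ["admin"]}, {"roles": ["admin", "user"]})
assert not _matches_owner({"roles": ["admin"]}, {"roles": ["user"]})
assert _matches_owner(None, None)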
+@patch("llama_stack.providers.inline.agents.meta_reference.persistence.get_authenticated_user") +async def test_tool_call_and_infer_iters_access_control(mock_get_authenticated_user, test_setup): agent_persistence = test_setup # Create a session with restricted access @@ -138,8 +143,9 @@ async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes session_id=session_id, session_name="Restricted Session", started_at=datetime.now(), - access_attributes=AccessAttributes(roles=["admin"]), + owner=User("someone", {"roles": ["admin"]}), turns=[], + identifier="Restricted Session", ) await agent_persistence.kvstore.set( @@ -150,7 +156,7 @@ async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes turn_id = str(uuid.uuid4()) # Admin user can set inference iterations - mock_get_auth_attributes.return_value = {"roles": ["admin"]} + mock_get_authenticated_user.return_value = User("testuser", {"roles": ["admin"]}) await agent_persistence.set_num_infer_iters_in_turn(session_id, turn_id, 5) # Admin user can get inference iterations @@ -158,7 +164,7 @@ async def test_tool_call_and_infer_iters_access_control(mock_get_auth_attributes assert infer_iters == 5 # Regular user cannot get inference iterations - mock_get_auth_attributes.return_value = {"roles": ["user"]} + mock_get_authenticated_user.return_value = User("testuser", {"roles": ["user"]}) infer_iters = await agent_persistence.get_num_infer_iters_in_turn(session_id, turn_id) assert infer_iters is None diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index f9eaee7d6..17c867af1 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -69,9 +69,12 @@ class MockInferenceAdapterWithSleep: # ruff: noqa: N802 def do_POST(self): time.sleep(sleep_time) + response_body = json.dumps(response).encode("utf-8") self.send_response(code=200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", len(response_body)) self.end_headers() - self.wfile.write(json.dumps(response).encode("utf-8")) + self.wfile.write(response_body) self.request_handler = DelayedRequestHandler diff --git a/tests/unit/providers/vector_io/test_qdrant.py b/tests/unit/providers/vector_io/test_qdrant.py index 34df9b52f..607eccb24 100644 --- a/tests/unit/providers/vector_io/test_qdrant.py +++ b/tests/unit/providers/vector_io/test_qdrant.py @@ -50,6 +50,7 @@ def mock_vector_db(vector_db_id) -> MagicMock: mock_vector_db = MagicMock(spec=VectorDB) mock_vector_db.embedding_model = "embedding_model" mock_vector_db.identifier = vector_db_id + mock_vector_db.embedding_dimension = 384 return mock_vector_db diff --git a/tests/unit/rag/test_vector_store.py b/tests/unit/rag/test_vector_store.py index f97808a6d..9d6b9ee67 100644 --- a/tests/unit/rag/test_vector_store.py +++ b/tests/unit/rag/test_vector_store.py @@ -8,11 +8,20 @@ import base64 import mimetypes import os from pathlib import Path +from unittest.mock import AsyncMock, MagicMock +import numpy as np import pytest from llama_stack.apis.tools import RAGDocument -from llama_stack.providers.utils.memory.vector_store import URL, content_from_doc, make_overlapped_chunks +from llama_stack.apis.vector_io import Chunk +from llama_stack.providers.utils.memory.vector_store import ( + URL, + VectorDBWithIndex, + _validate_embedding, + content_from_doc, + make_overlapped_chunks, +) DUMMY_PDF_PATH = Path(os.path.abspath(__file__)).parent / "fixtures" / 
"dummy.pdf" # Depending on the machine, this can get parsed a couple of ways @@ -36,6 +45,72 @@ def data_url_from_file(file_path: str) -> str: return data_url +class TestChunk: + def test_chunk(self): + chunk = Chunk( + content="Example chunk content", + metadata={"key": "value"}, + embedding=[0.1, 0.2, 0.3], + ) + + assert chunk.content == "Example chunk content" + assert chunk.metadata == {"key": "value"} + assert chunk.embedding == [0.1, 0.2, 0.3] + + chunk_no_embedding = Chunk( + content="Example chunk content", + metadata={"key": "value"}, + ) + assert chunk_no_embedding.embedding is None + + +class TestValidateEmbedding: + def test_valid_list_embeddings(self): + _validate_embedding([0.1, 0.2, 0.3], 0, 3) + _validate_embedding([1, 2, 3], 1, 3) + _validate_embedding([0.1, 2, 3.5], 2, 3) + + def test_valid_numpy_embeddings(self): + _validate_embedding(np.array([0.1, 0.2, 0.3], dtype=np.float32), 0, 3) + _validate_embedding(np.array([0.1, 0.2, 0.3], dtype=np.float64), 1, 3) + _validate_embedding(np.array([1, 2, 3], dtype=np.int32), 2, 3) + _validate_embedding(np.array([1, 2, 3], dtype=np.int64), 3, 3) + + def test_invalid_embedding_type(self): + error_msg = "must be a list or numpy array" + + with pytest.raises(ValueError, match=error_msg): + _validate_embedding("not a list", 0, 3) + + with pytest.raises(ValueError, match=error_msg): + _validate_embedding(None, 1, 3) + + with pytest.raises(ValueError, match=error_msg): + _validate_embedding(42, 2, 3) + + def test_non_numeric_values(self): + error_msg = "contains non-numeric values" + + with pytest.raises(ValueError, match=error_msg): + _validate_embedding([0.1, "string", 0.3], 0, 3) + + with pytest.raises(ValueError, match=error_msg): + _validate_embedding([0.1, None, 0.3], 1, 3) + + with pytest.raises(ValueError, match=error_msg): + _validate_embedding([1, {}, 3], 2, 3) + + def test_wrong_dimension(self): + with pytest.raises(ValueError, match="has dimension 4, expected 3"): + _validate_embedding([0.1, 0.2, 0.3, 0.4], 0, 3) + + with pytest.raises(ValueError, match="has dimension 2, expected 3"): + _validate_embedding([0.1, 0.2], 1, 3) + + with pytest.raises(ValueError, match="has dimension 0, expected 3"): + _validate_embedding([], 2, 3) + + class TestVectorStore: @pytest.mark.asyncio async def test_returns_content_from_pdf_data_uri(self): @@ -126,3 +201,126 @@ class TestVectorStore: assert str(excinfo.value) == "Failed to serialize metadata to string" assert isinstance(excinfo.value.__cause__, TypeError) assert str(excinfo.value.__cause__) == "Cannot convert to string" + + +class TestVectorDBWithIndex: + @pytest.mark.asyncio + async def test_insert_chunks_without_embeddings(self): + mock_vector_db = MagicMock() + mock_vector_db.embedding_model = "test-model without embeddings" + mock_index = AsyncMock() + mock_inference_api = AsyncMock() + + vector_db_with_index = VectorDBWithIndex( + vector_db=mock_vector_db, index=mock_index, inference_api=mock_inference_api + ) + + chunks = [ + Chunk(content="Test 1", embedding=None, metadata={}), + Chunk(content="Test 2", embedding=None, metadata={}), + ] + + mock_inference_api.embeddings.return_value.embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] + + await vector_db_with_index.insert_chunks(chunks) + + mock_inference_api.embeddings.assert_called_once_with("test-model without embeddings", ["Test 1", "Test 2"]) + mock_index.add_chunks.assert_called_once() + args = mock_index.add_chunks.call_args[0] + assert args[0] == chunks + assert np.array_equal(args[1], np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 
0.6]], dtype=np.float32)) + + @pytest.mark.asyncio + async def test_insert_chunks_with_valid_embeddings(self): + mock_vector_db = MagicMock() + mock_vector_db.embedding_model = "test-model with embeddings" + mock_vector_db.embedding_dimension = 3 + mock_index = AsyncMock() + mock_inference_api = AsyncMock() + + vector_db_with_index = VectorDBWithIndex( + vector_db=mock_vector_db, index=mock_index, inference_api=mock_inference_api + ) + + chunks = [ + Chunk(content="Test 1", embedding=[0.1, 0.2, 0.3], metadata={}), + Chunk(content="Test 2", embedding=[0.4, 0.5, 0.6], metadata={}), + ] + + await vector_db_with_index.insert_chunks(chunks) + + mock_inference_api.embeddings.assert_not_called() + mock_index.add_chunks.assert_called_once() + args = mock_index.add_chunks.call_args[0] + assert args[0] == chunks + assert np.array_equal(args[1], np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=np.float32)) + + @pytest.mark.asyncio + async def test_insert_chunks_with_invalid_embeddings(self): + mock_vector_db = MagicMock() + mock_vector_db.embedding_dimension = 3 + mock_vector_db.embedding_model = "test-model with invalid embeddings" + mock_index = AsyncMock() + mock_inference_api = AsyncMock() + + vector_db_with_index = VectorDBWithIndex( + vector_db=mock_vector_db, index=mock_index, inference_api=mock_inference_api + ) + + # Verify Chunk raises ValueError for invalid embedding type + with pytest.raises(ValueError, match="Input should be a valid list"): + Chunk(content="Test 1", embedding="invalid_type", metadata={}) + + # Verify Chunk raises ValueError for invalid embedding type in insert_chunks (i.e., Chunk errors before insert_chunks is called) + with pytest.raises(ValueError, match="Input should be a valid list"): + await vector_db_with_index.insert_chunks( + [ + Chunk(content="Test 1", embedding=None, metadata={}), + Chunk(content="Test 2", embedding="invalid_type", metadata={}), + ] + ) + + # Verify Chunk raises ValueError for invalid embedding element type in insert_chunks (i.e., Chunk errors before insert_chunks is called) + with pytest.raises(ValueError, match=" Input should be a valid number, unable to parse string as a number "): + await vector_db_with_index.insert_chunks( + Chunk(content="Test 1", embedding=[0.1, "string", 0.3], metadata={}) + ) + + chunks_wrong_dim = [ + Chunk(content="Test 1", embedding=[0.1, 0.2, 0.3, 0.4], metadata={}), + ] + with pytest.raises(ValueError, match="has dimension 4, expected 3"): + await vector_db_with_index.insert_chunks(chunks_wrong_dim) + + mock_inference_api.embeddings.assert_not_called() + mock_index.add_chunks.assert_not_called() + + @pytest.mark.asyncio + async def test_insert_chunks_with_partially_precomputed_embeddings(self): + mock_vector_db = MagicMock() + mock_vector_db.embedding_model = "test-model with partial embeddings" + mock_vector_db.embedding_dimension = 3 + mock_index = AsyncMock() + mock_inference_api = AsyncMock() + + vector_db_with_index = VectorDBWithIndex( + vector_db=mock_vector_db, index=mock_index, inference_api=mock_inference_api + ) + + chunks = [ + Chunk(content="Test 1", embedding=None, metadata={}), + Chunk(content="Test 2", embedding=[0.2, 0.2, 0.2], metadata={}), + Chunk(content="Test 3", embedding=None, metadata={}), + ] + + mock_inference_api.embeddings.return_value.embeddings = [[0.1, 0.1, 0.1], [0.3, 0.3, 0.3]] + + await vector_db_with_index.insert_chunks(chunks) + + mock_inference_api.embeddings.assert_called_once_with( + "test-model with partial embeddings", ["Test 1", "Test 3"] + ) + 
mock_index.add_chunks.assert_called_once() + args = mock_index.add_chunks.call_args[0] + assert len(args[0]) == 3 + assert np.array_equal(args[1], np.array([[0.1, 0.1, 0.1], [0.2, 0.2, 0.2], [0.3, 0.3, 0.3]], dtype=np.float32)) diff --git a/tests/unit/registry/test_registry_acl.py b/tests/unit/registry/test_registry_acl.py index 25ea37bfa..48b3ac51b 100644 --- a/tests/unit/registry/test_registry_acl.py +++ b/tests/unit/registry/test_registry_acl.py @@ -8,19 +8,18 @@ import pytest from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ModelWithACL -from llama_stack.distribution.server.auth_providers import AccessAttributes +from llama_stack.distribution.datatypes import ModelWithOwner, User from llama_stack.distribution.store.registry import CachedDiskDistributionRegistry @pytest.mark.asyncio async def test_registry_cache_with_acl(cached_disk_dist_registry): - model = ModelWithACL( + model = ModelWithOwner( identifier="model-acl", provider_id="test-provider", provider_resource_id="model-acl-resource", model_type=ModelType.llm, - access_attributes=AccessAttributes(roles=["admin"], teams=["ai-team"]), + owner=User("testuser", {"roles": ["admin"], "teams": ["ai-team"]}), ) success = await cached_disk_dist_registry.register(model) @@ -29,22 +28,14 @@ async def test_registry_cache_with_acl(cached_disk_dist_registry): cached_model = cached_disk_dist_registry.get_cached("model", "model-acl") assert cached_model is not None assert cached_model.identifier == "model-acl" - assert cached_model.access_attributes.roles == ["admin"] - assert cached_model.access_attributes.teams == ["ai-team"] + assert cached_model.owner.principal == "testuser" + assert cached_model.owner.attributes["roles"] == ["admin"] + assert cached_model.owner.attributes["teams"] == ["ai-team"] fetched_model = await cached_disk_dist_registry.get("model", "model-acl") assert fetched_model is not None assert fetched_model.identifier == "model-acl" - assert fetched_model.access_attributes.roles == ["admin"] - - model.access_attributes = AccessAttributes(roles=["admin", "user"], projects=["project-x"]) - await cached_disk_dist_registry.update(model) - - updated_cached = cached_disk_dist_registry.get_cached("model", "model-acl") - assert updated_cached is not None - assert updated_cached.access_attributes.roles == ["admin", "user"] - assert updated_cached.access_attributes.projects == ["project-x"] - assert updated_cached.access_attributes.teams is None + assert fetched_model.owner.attributes["roles"] == ["admin"] new_registry = CachedDiskDistributionRegistry(cached_disk_dist_registry.kvstore) await new_registry.initialize() @@ -52,35 +43,32 @@ async def test_registry_cache_with_acl(cached_disk_dist_registry): new_model = await new_registry.get("model", "model-acl") assert new_model is not None assert new_model.identifier == "model-acl" - assert new_model.access_attributes.roles == ["admin", "user"] - assert new_model.access_attributes.projects == ["project-x"] - assert new_model.access_attributes.teams is None + assert new_model.owner.principal == "testuser" + assert new_model.owner.attributes["roles"] == ["admin"] + assert new_model.owner.attributes["teams"] == ["ai-team"] @pytest.mark.asyncio async def test_registry_empty_acl(cached_disk_dist_registry): - model = ModelWithACL( + model = ModelWithOwner( identifier="model-empty-acl", provider_id="test-provider", provider_resource_id="model-resource", model_type=ModelType.llm, - access_attributes=AccessAttributes(), + owner=User("testuser", None), ) 
await cached_disk_dist_registry.register(model) cached_model = cached_disk_dist_registry.get_cached("model", "model-empty-acl") assert cached_model is not None - assert cached_model.access_attributes is not None - assert cached_model.access_attributes.roles is None - assert cached_model.access_attributes.teams is None - assert cached_model.access_attributes.projects is None - assert cached_model.access_attributes.namespaces is None + assert cached_model.owner is not None + assert cached_model.owner.attributes is None all_models = await cached_disk_dist_registry.get_all() assert len(all_models) == 1 - model = ModelWithACL( + model = ModelWithOwner( identifier="model-no-acl", provider_id="test-provider", provider_resource_id="model-resource-2", @@ -91,7 +79,7 @@ async def test_registry_empty_acl(cached_disk_dist_registry): cached_model = cached_disk_dist_registry.get_cached("model", "model-no-acl") assert cached_model is not None - assert cached_model.access_attributes is None + assert cached_model.owner is None all_models = await cached_disk_dist_registry.get_all() assert len(all_models) == 2 @@ -99,19 +87,19 @@ async def test_registry_empty_acl(cached_disk_dist_registry): @pytest.mark.asyncio async def test_registry_serialization(cached_disk_dist_registry): - attributes = AccessAttributes( - roles=["admin", "researcher"], - teams=["ai-team", "ml-team"], - projects=["project-a", "project-b"], - namespaces=["prod", "staging"], - ) + attributes = { + "roles": ["admin", "researcher"], + "teams": ["ai-team", "ml-team"], + "projects": ["project-a", "project-b"], + "namespaces": ["prod", "staging"], + } - model = ModelWithACL( + model = ModelWithOwner( identifier="model-serialize", provider_id="test-provider", provider_resource_id="model-resource", model_type=ModelType.llm, - access_attributes=attributes, + owner=User("bob", attributes), ) await cached_disk_dist_registry.register(model) @@ -122,7 +110,7 @@ async def test_registry_serialization(cached_disk_dist_registry): loaded_model = await new_registry.get("model", "model-serialize") assert loaded_model is not None - assert loaded_model.access_attributes.roles == ["admin", "researcher"] - assert loaded_model.access_attributes.teams == ["ai-team", "ml-team"] - assert loaded_model.access_attributes.projects == ["project-a", "project-b"] - assert loaded_model.access_attributes.namespaces == ["prod", "staging"] + assert loaded_model.owner.attributes["roles"] == ["admin", "researcher"] + assert loaded_model.owner.attributes["teams"] == ["ai-team", "ml-team"] + assert loaded_model.owner.attributes["projects"] == ["project-a", "project-b"] + assert loaded_model.owner.attributes["namespaces"] == ["prod", "staging"] diff --git a/tests/unit/server/test_access_control.py b/tests/unit/server/test_access_control.py index e352ba54d..f9ad47b0c 100644 --- a/tests/unit/server/test_access_control.py +++ b/tests/unit/server/test_access_control.py @@ -7,10 +7,13 @@ from unittest.mock import MagicMock, Mock, patch import pytest +import yaml +from pydantic import TypeAdapter, ValidationError from llama_stack.apis.datatypes import Api from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import AccessAttributes, ModelWithACL +from llama_stack.distribution.access_control.access_control import AccessDeniedError, is_action_allowed +from llama_stack.distribution.datatypes import AccessRule, ModelWithOwner, User from llama_stack.distribution.routing_tables.models import ModelsRoutingTable @@ -32,39 +35,40 @@ async def 
test_setup(cached_disk_dist_registry): routing_table = ModelsRoutingTable( impls_by_provider_id={"test_provider": mock_inference}, dist_registry=cached_disk_dist_registry, + policy={}, ) yield cached_disk_dist_registry, routing_table @pytest.mark.asyncio -@patch("llama_stack.distribution.routing_tables.common.get_auth_attributes") -async def test_access_control_with_cache(mock_get_auth_attributes, test_setup): +@patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") +async def test_access_control_with_cache(mock_get_authenticated_user, test_setup): registry, routing_table = test_setup - model_public = ModelWithACL( + model_public = ModelWithOwner( identifier="model-public", provider_id="test_provider", provider_resource_id="model-public", model_type=ModelType.llm, ) - model_admin_only = ModelWithACL( + model_admin_only = ModelWithOwner( identifier="model-admin", provider_id="test_provider", provider_resource_id="model-admin", model_type=ModelType.llm, - access_attributes=AccessAttributes(roles=["admin"]), + owner=User("testuser", {"roles": ["admin"]}), ) - model_data_scientist = ModelWithACL( + model_data_scientist = ModelWithOwner( identifier="model-data-scientist", provider_id="test_provider", provider_resource_id="model-data-scientist", model_type=ModelType.llm, - access_attributes=AccessAttributes(roles=["data-scientist", "researcher"], teams=["ml-team"]), + owner=User("testuser", {"roles": ["data-scientist", "researcher"], "teams": ["ml-team"]}), ) await registry.register(model_public) await registry.register(model_admin_only) await registry.register(model_data_scientist) - mock_get_auth_attributes.return_value = {"roles": ["admin"], "teams": ["management"]} + mock_get_authenticated_user.return_value = User("test-user", {"roles": ["admin"], "teams": ["management"]}) all_models = await routing_table.list_models() assert len(all_models.data) == 2 @@ -75,7 +79,7 @@ async def test_access_control_with_cache(mock_get_auth_attributes, test_setup): with pytest.raises(ValueError): await routing_table.get_model("model-data-scientist") - mock_get_auth_attributes.return_value = {"roles": ["data-scientist"], "teams": ["other-team"]} + mock_get_authenticated_user.return_value = User("test-user", {"roles": ["data-scientist"], "teams": ["other-team"]}) all_models = await routing_table.list_models() assert len(all_models.data) == 1 assert all_models.data[0].identifier == "model-public" @@ -86,7 +90,7 @@ async def test_access_control_with_cache(mock_get_auth_attributes, test_setup): with pytest.raises(ValueError): await routing_table.get_model("model-data-scientist") - mock_get_auth_attributes.return_value = {"roles": ["data-scientist"], "teams": ["ml-team"]} + mock_get_authenticated_user.return_value = User("test-user", {"roles": ["data-scientist"], "teams": ["ml-team"]}) all_models = await routing_table.list_models() assert len(all_models.data) == 2 model_ids = [m.identifier for m in all_models.data] @@ -102,50 +106,62 @@ async def test_access_control_with_cache(mock_get_auth_attributes, test_setup): @pytest.mark.asyncio -@patch("llama_stack.distribution.routing_tables.common.get_auth_attributes") -async def test_access_control_and_updates(mock_get_auth_attributes, test_setup): +@patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") +async def test_access_control_and_updates(mock_get_authenticated_user, test_setup): registry, routing_table = test_setup - model_public = ModelWithACL( + model_public = ModelWithOwner( identifier="model-updates", 
provider_id="test_provider", provider_resource_id="model-updates", model_type=ModelType.llm, ) await registry.register(model_public) - mock_get_auth_attributes.return_value = { - "roles": ["user"], - } + mock_get_authenticated_user.return_value = User( + "test-user", + { + "roles": ["user"], + }, + ) model = await routing_table.get_model("model-updates") assert model.identifier == "model-updates" - model_public.access_attributes = AccessAttributes(roles=["admin"]) + model_public.owner = User("testuser", {"roles": ["admin"]}) await registry.update(model_public) - mock_get_auth_attributes.return_value = { - "roles": ["user"], - } + mock_get_authenticated_user.return_value = User( + "test-user", + { + "roles": ["user"], + }, + ) with pytest.raises(ValueError): await routing_table.get_model("model-updates") - mock_get_auth_attributes.return_value = { - "roles": ["admin"], - } + mock_get_authenticated_user.return_value = User( + "test-user", + { + "roles": ["admin"], + }, + ) model = await routing_table.get_model("model-updates") assert model.identifier == "model-updates" @pytest.mark.asyncio -@patch("llama_stack.distribution.routing_tables.common.get_auth_attributes") -async def test_access_control_empty_attributes(mock_get_auth_attributes, test_setup): +@patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") +async def test_access_control_empty_attributes(mock_get_authenticated_user, test_setup): registry, routing_table = test_setup - model = ModelWithACL( + model = ModelWithOwner( identifier="model-empty-attrs", provider_id="test_provider", provider_resource_id="model-empty-attrs", model_type=ModelType.llm, - access_attributes=AccessAttributes(), + owner=User("testuser", {}), ) await registry.register(model) - mock_get_auth_attributes.return_value = { - "roles": [], - } + mock_get_authenticated_user.return_value = User( + "test-user", + { + "roles": [], + }, + ) result = await routing_table.get_model("model-empty-attrs") assert result.identifier == "model-empty-attrs" all_models = await routing_table.list_models() @@ -154,25 +170,25 @@ async def test_access_control_empty_attributes(mock_get_auth_attributes, test_se @pytest.mark.asyncio -@patch("llama_stack.distribution.routing_tables.common.get_auth_attributes") -async def test_no_user_attributes(mock_get_auth_attributes, test_setup): +@patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") +async def test_no_user_attributes(mock_get_authenticated_user, test_setup): registry, routing_table = test_setup - model_public = ModelWithACL( + model_public = ModelWithOwner( identifier="model-public-2", provider_id="test_provider", provider_resource_id="model-public-2", model_type=ModelType.llm, ) - model_restricted = ModelWithACL( + model_restricted = ModelWithOwner( identifier="model-restricted", provider_id="test_provider", provider_resource_id="model-restricted", model_type=ModelType.llm, - access_attributes=AccessAttributes(roles=["admin"]), + owner=User("testuser", {"roles": ["admin"]}), ) await registry.register(model_public) await registry.register(model_restricted) - mock_get_auth_attributes.return_value = None + mock_get_authenticated_user.return_value = User("test-user", None) model = await routing_table.get_model("model-public-2") assert model.identifier == "model-public-2" @@ -185,17 +201,17 @@ async def test_no_user_attributes(mock_get_auth_attributes, test_setup): @pytest.mark.asyncio -@patch("llama_stack.distribution.routing_tables.common.get_auth_attributes") -async def 
test_automatic_access_attributes(mock_get_auth_attributes, test_setup): +@patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") +async def test_automatic_access_attributes(mock_get_authenticated_user, test_setup): """Test that newly created resources inherit access attributes from their creator.""" registry, routing_table = test_setup # Set creator's attributes creator_attributes = {"roles": ["data-scientist"], "teams": ["ml-team"], "projects": ["llama-3"]} - mock_get_auth_attributes.return_value = creator_attributes + mock_get_authenticated_user.return_value = User("test-user", creator_attributes) # Create model without explicit access attributes - model = ModelWithACL( + model = ModelWithOwner( identifier="auto-access-model", provider_id="test_provider", provider_resource_id="auto-access-model", @@ -205,21 +221,346 @@ async def test_automatic_access_attributes(mock_get_auth_attributes, test_setup) # Verify the model got creator's attributes registered_model = await routing_table.get_model("auto-access-model") - assert registered_model.access_attributes is not None - assert registered_model.access_attributes.roles == ["data-scientist"] - assert registered_model.access_attributes.teams == ["ml-team"] - assert registered_model.access_attributes.projects == ["llama-3"] + assert registered_model.owner is not None + assert registered_model.owner.attributes is not None + assert registered_model.owner.attributes["roles"] == ["data-scientist"] + assert registered_model.owner.attributes["teams"] == ["ml-team"] + assert registered_model.owner.attributes["projects"] == ["llama-3"] # Verify another user without matching attributes can't access it - mock_get_auth_attributes.return_value = {"roles": ["engineer"], "teams": ["infra-team"]} + mock_get_authenticated_user.return_value = User("test-user", {"roles": ["engineer"], "teams": ["infra-team"]}) with pytest.raises(ValueError): await routing_table.get_model("auto-access-model") # But a user with matching attributes can - mock_get_auth_attributes.return_value = { - "roles": ["data-scientist", "engineer"], - "teams": ["ml-team", "platform-team"], - "projects": ["llama-3"], - } + mock_get_authenticated_user.return_value = User( + "test-user", + { + "roles": ["data-scientist", "engineer"], + "teams": ["ml-team", "platform-team"], + "projects": ["llama-3"], + }, + ) model = await routing_table.get_model("auto-access-model") assert model.identifier == "auto-access-model" + + +@pytest.fixture +async def test_setup_with_access_policy(cached_disk_dist_registry): + mock_inference = Mock() + mock_inference.__provider_spec__ = MagicMock() + mock_inference.__provider_spec__.api = Api.inference + mock_inference.register_model = AsyncMock(side_effect=_return_model) + mock_inference.unregister_model = AsyncMock(side_effect=_return_model) + + config = """ + - permit: + principal: user-1 + actions: [create, read, delete] + description: user-1 has full access to all models + - permit: + principal: user-2 + actions: [read] + resource: model::model-1 + description: user-2 has read access to model-1 only + - permit: + principal: user-3 + actions: [read] + resource: model::model-2 + description: user-3 has read access to model-2 only + - forbid: + actions: [create, read, delete] + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + routing_table = ModelsRoutingTable( + impls_by_provider_id={"test_provider": mock_inference}, + dist_registry=cached_disk_dist_registry, + policy=policy, + ) + yield routing_table + + 
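# The deny-by-default shape built in the fixture above can also be checked directly
# with is_action_allowed; a small self-contained sketch (hypothetical identifiers,
# mirroring what the routing-table assertions below exercise through register/get):
import yaml
from pydantic import TypeAdapter

from llama_stack.apis.models import ModelType
from llama_stack.distribution.access_control.access_control import is_action_allowed
from llama_stack.distribution.datatypes import AccessRule, ModelWithOwner, User

_scoped_policy = TypeAdapter(list[AccessRule]).validate_python(
    yaml.safe_load(
        """
        - permit:
            principal: user-2
            actions: [read]
            resource: model::model-1
        - forbid:
            actions: [create, read, delete]
        """
    )
)
_model_1 = ModelWithOwner(identifier="model-1", provider_id="test_provider", model_type=ModelType.llm)
assert is_action_allowed(_scoped_policy, "read", _model_1, User("user-2", None))
assert not is_action_allowed(_scoped_policy, "delete", _model_1, User("user-2", None))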
+@pytest.mark.asyncio +@patch("llama_stack.distribution.routing_tables.common.get_authenticated_user") +async def test_access_policy(mock_get_authenticated_user, test_setup_with_access_policy): + routing_table = test_setup_with_access_policy + mock_get_authenticated_user.return_value = User( + "user-1", + { + "roles": ["admin"], + "projects": ["foo", "bar"], + }, + ) + await routing_table.register_model("model-1", provider_id="test_provider") + await routing_table.register_model("model-2", provider_id="test_provider") + await routing_table.register_model("model-3", provider_id="test_provider") + model = await routing_table.get_model("model-1") + assert model.identifier == "model-1" + model = await routing_table.get_model("model-2") + assert model.identifier == "model-2" + model = await routing_table.get_model("model-3") + assert model.identifier == "model-3" + + mock_get_authenticated_user.return_value = User( + "user-2", + { + "roles": ["user"], + "projects": ["foo"], + }, + ) + model = await routing_table.get_model("model-1") + assert model.identifier == "model-1" + with pytest.raises(ValueError): + await routing_table.get_model("model-2") + with pytest.raises(ValueError): + await routing_table.get_model("model-3") + with pytest.raises(AccessDeniedError): + await routing_table.register_model("model-4", provider_id="test_provider") + with pytest.raises(AccessDeniedError): + await routing_table.unregister_model("model-1") + + mock_get_authenticated_user.return_value = User( + "user-3", + { + "roles": ["user"], + "projects": ["bar"], + }, + ) + model = await routing_table.get_model("model-2") + assert model.identifier == "model-2" + with pytest.raises(ValueError): + await routing_table.get_model("model-1") + with pytest.raises(ValueError): + await routing_table.get_model("model-3") + with pytest.raises(AccessDeniedError): + await routing_table.register_model("model-5", provider_id="test_provider") + with pytest.raises(AccessDeniedError): + await routing_table.unregister_model("model-2") + + mock_get_authenticated_user.return_value = User( + "user-1", + { + "roles": ["admin"], + "projects": ["foo", "bar"], + }, + ) + await routing_table.unregister_model("model-3") + with pytest.raises(ValueError): + await routing_table.get_model("model-3") + + +def test_permit_when(): + config = """ + - permit: + principal: user-1 + actions: [read] + when: user in owners namespaces + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + model = ModelWithOwner( + identifier="mymodel", + provider_id="myprovider", + model_type=ModelType.llm, + owner=User("testuser", {"namespaces": ["foo"]}), + ) + assert is_action_allowed(policy, "read", model, User("user-1", {"namespaces": ["foo"]})) + assert not is_action_allowed(policy, "read", model, User("user-1", {"namespaces": ["bar"]})) + assert not is_action_allowed(policy, "read", model, User("user-2", {"namespaces": ["foo"]})) + + +def test_permit_unless(): + config = """ + - permit: + principal: user-1 + actions: [read] + resource: model::* + unless: + - user not in owners namespaces + - user in owners teams + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + model = ModelWithOwner( + identifier="mymodel", + provider_id="myprovider", + model_type=ModelType.llm, + owner=User("testuser", {"namespaces": ["foo"]}), + ) + assert is_action_allowed(policy, "read", model, User("user-1", {"namespaces": ["foo"]})) + assert not is_action_allowed(policy, "read", model, User("user-1", {"namespaces": ["bar"]})) 
+ assert not is_action_allowed(policy, "read", model, User("user-2", {"namespaces": ["foo"]})) + + +def test_forbid_when(): + config = """ + - forbid: + principal: user-1 + actions: [read] + when: + user in owners namespaces + - permit: + actions: [read] + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + model = ModelWithOwner( + identifier="mymodel", + provider_id="myprovider", + model_type=ModelType.llm, + owner=User("testuser", {"namespaces": ["foo"]}), + ) + assert not is_action_allowed(policy, "read", model, User("user-1", {"namespaces": ["foo"]})) + assert is_action_allowed(policy, "read", model, User("user-1", {"namespaces": ["bar"]})) + assert is_action_allowed(policy, "read", model, User("user-2", {"namespaces": ["foo"]})) + + +def test_forbid_unless(): + config = """ + - forbid: + principal: user-1 + actions: [read] + unless: + user in owners namespaces + - permit: + actions: [read] + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + model = ModelWithOwner( + identifier="mymodel", + provider_id="myprovider", + model_type=ModelType.llm, + owner=User("testuser", {"namespaces": ["foo"]}), + ) + assert is_action_allowed(policy, "read", model, User("user-1", {"namespaces": ["foo"]})) + assert not is_action_allowed(policy, "read", model, User("user-1", {"namespaces": ["bar"]})) + assert is_action_allowed(policy, "read", model, User("user-2", {"namespaces": ["foo"]})) + + +def test_user_has_attribute(): + config = """ + - permit: + actions: [read] + when: user with admin in roles + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + model = ModelWithOwner( + identifier="mymodel", + provider_id="myprovider", + model_type=ModelType.llm, + ) + assert not is_action_allowed(policy, "read", model, User("user-1", {"roles": ["basic"]})) + assert is_action_allowed(policy, "read", model, User("user-2", {"roles": ["admin"]})) + assert not is_action_allowed(policy, "read", model, User("user-3", {"namespaces": ["foo"]})) + assert not is_action_allowed(policy, "read", model, User("user-4", None)) + + +def test_user_does_not_have_attribute(): + config = """ + - permit: + actions: [read] + unless: user with admin not in roles + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + model = ModelWithOwner( + identifier="mymodel", + provider_id="myprovider", + model_type=ModelType.llm, + ) + assert not is_action_allowed(policy, "read", model, User("user-1", {"roles": ["basic"]})) + assert is_action_allowed(policy, "read", model, User("user-2", {"roles": ["admin"]})) + assert not is_action_allowed(policy, "read", model, User("user-3", {"namespaces": ["foo"]})) + assert not is_action_allowed(policy, "read", model, User("user-4", None)) + + +def test_is_owner(): + config = """ + - permit: + actions: [read] + when: user is owner + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + model = ModelWithOwner( + identifier="mymodel", + provider_id="myprovider", + model_type=ModelType.llm, + owner=User("user-2", {"namespaces": ["foo"]}), + ) + assert not is_action_allowed(policy, "read", model, User("user-1", {"roles": ["basic"]})) + assert is_action_allowed(policy, "read", model, User("user-2", {"roles": ["admin"]})) + assert not is_action_allowed(policy, "read", model, User("user-3", {"namespaces": ["foo"]})) + assert not is_action_allowed(policy, "read", model, User("user-4", None)) + + +def test_is_not_owner(): + config = """ + - 
permit: + actions: [read] + unless: user is not owner + """ + policy = TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + model = ModelWithOwner( + identifier="mymodel", + provider_id="myprovider", + model_type=ModelType.llm, + owner=User("user-2", {"namespaces": ["foo"]}), + ) + assert not is_action_allowed(policy, "read", model, User("user-1", {"roles": ["basic"]})) + assert is_action_allowed(policy, "read", model, User("user-2", {"roles": ["admin"]})) + assert not is_action_allowed(policy, "read", model, User("user-3", {"namespaces": ["foo"]})) + assert not is_action_allowed(policy, "read", model, User("user-4", None)) + + +def test_invalid_rule_permit_and_forbid_both_specified(): + config = """ + - permit: + actions: [read] + forbid: + actions: [create] + """ + with pytest.raises(ValidationError): + TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + + +def test_invalid_rule_neither_permit_or_forbid_specified(): + config = """ + - when: user is owner + unless: user with admin in roles + """ + with pytest.raises(ValidationError): + TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + + +def test_invalid_rule_when_and_unless_both_specified(): + config = """ + - permit: + actions: [read] + when: user is owner + unless: user with admin in roles + """ + with pytest.raises(ValidationError): + TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + + +def test_invalid_condition(): + config = """ + - permit: + actions: [read] + when: random words that are not valid + """ + with pytest.raises(ValidationError): + TypeAdapter(list[AccessRule]).validate_python(yaml.safe_load(config)) + + +@pytest.mark.parametrize( + "condition", + [ + "user is owner", + "user is not owner", + "user with dev in teams", + "user with default not in namespaces", + "user in owners roles", + "user not in owners projects", + ], +) +def test_condition_reprs(condition): + from llama_stack.distribution.access_control.conditions import parse_condition + + assert condition == str(parse_condition(condition)) diff --git a/tests/unit/server/test_auth.py b/tests/unit/server/test_auth.py index 408acb88a..e159aefd1 100644 --- a/tests/unit/server/test_auth.py +++ b/tests/unit/server/test_auth.py @@ -139,7 +139,7 @@ async def mock_post_success(*args, **kwargs): { "message": "Authentication successful", "principal": "test-principal", - "access_attributes": { + "attributes": { "roles": ["admin", "user"], "teams": ["ml-team", "nlp-team"], "projects": ["llama-3", "project-x"], @@ -233,7 +233,7 @@ async def test_http_middleware_with_access_attributes(mock_http_middleware, mock { "message": "Authentication successful", "principal": "test-principal", - "access_attributes": { + "attributes": { "roles": ["admin", "user"], "teams": ["ml-team", "nlp-team"], "projects": ["llama-3", "project-x"], @@ -255,33 +255,6 @@ async def test_http_middleware_with_access_attributes(mock_http_middleware, mock mock_app.assert_called_once_with(mock_scope, mock_receive, mock_send) -@pytest.mark.asyncio -async def test_http_middleware_no_attributes(mock_http_middleware, mock_scope): - """Test middleware behavior with no access attributes""" - middleware, mock_app = mock_http_middleware - mock_receive = AsyncMock() - mock_send = AsyncMock() - - with patch("httpx.AsyncClient") as mock_client: - mock_client_instance = AsyncMock() - mock_client.return_value.__aenter__.return_value = mock_client_instance - - mock_client_instance.post.return_value = MockResponse( - 200, - { - "message": 
"Authentication successful" - # No access_attributes - }, - ) - - await middleware(mock_scope, mock_receive, mock_send) - - assert "user_attributes" in mock_scope - attributes = mock_scope["user_attributes"] - assert "roles" in attributes - assert attributes["roles"] == ["test.jwt.token"] - - # oauth2 token provider tests @@ -380,16 +353,16 @@ def test_get_attributes_from_claims(): "aud": "llama-stack", } attributes = get_attributes_from_claims(claims, {"sub": "roles", "groups": "teams"}) - assert attributes.roles == ["my-user"] - assert attributes.teams == ["group1", "group2"] + assert attributes["roles"] == ["my-user"] + assert attributes["teams"] == ["group1", "group2"] claims = { "sub": "my-user", "tenant": "my-tenant", } attributes = get_attributes_from_claims(claims, {"sub": "roles", "tenant": "namespaces"}) - assert attributes.roles == ["my-user"] - assert attributes.namespaces == ["my-tenant"] + assert attributes["roles"] == ["my-user"] + assert attributes["namespaces"] == ["my-tenant"] claims = { "sub": "my-user", @@ -408,9 +381,9 @@ def test_get_attributes_from_claims(): "groups": "teams", }, ) - assert set(attributes.roles) == {"my-user", "my-username"} - assert set(attributes.teams) == {"my-team", "group1", "group2"} - assert attributes.namespaces == ["my-tenant"] + assert set(attributes["roles"]) == {"my-user", "my-username"} + assert set(attributes["teams"]) == {"my-team", "group1", "group2"} + assert attributes["namespaces"] == ["my-tenant"] # TODO: add more tests for oauth2 token provider diff --git a/tests/unit/server/test_resolver.py b/tests/unit/server/test_resolver.py index bb4c15dbc..acf4da0a3 100644 --- a/tests/unit/server/test_resolver.py +++ b/tests/unit/server/test_resolver.py @@ -100,9 +100,10 @@ async def test_resolve_impls_basic(): add_protocol_methods(SampleImpl, Inference) mock_module.get_provider_impl = AsyncMock(return_value=impl) + mock_module.get_provider_impl.__text_signature__ = "()" sys.modules["test_module"] = mock_module - impls = await resolve_impls(run_config, provider_registry, dist_registry) + impls = await resolve_impls(run_config, provider_registry, dist_registry, policy={}) assert Api.inference in impls assert isinstance(impls[Api.inference], InferenceRouter) diff --git a/tests/unit/utils/test_sqlstore.py b/tests/unit/utils/test_sqlstore.py index 8ded760ef..6231e9082 100644 --- a/tests/unit/utils/test_sqlstore.py +++ b/tests/unit/utils/test_sqlstore.py @@ -9,7 +9,7 @@ from tempfile import TemporaryDirectory import pytest from llama_stack.providers.utils.sqlstore.api import ColumnType -from llama_stack.providers.utils.sqlstore.sqlite.sqlite import SqliteSqlStoreImpl +from llama_stack.providers.utils.sqlstore.sqlalchemy_sqlstore import SqlAlchemySqlStoreImpl from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig @@ -17,7 +17,7 @@ from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig async def test_sqlite_sqlstore(): with TemporaryDirectory() as tmp_dir: db_name = "test.db" - sqlstore = SqliteSqlStoreImpl( + sqlstore = SqlAlchemySqlStoreImpl( SqliteSqlStoreConfig( db_path=tmp_dir + "/" + db_name, ) diff --git a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml index 51c7814a3..4d6c19b59 100644 --- a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml +++ b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml @@ -36,7 +36,7 @@ test_response_mcp_tool: test_params: case: - case_id: 
"boiling_point_tool" - input: "What is the boiling point of polyjuice?" + input: "What is the boiling point of myawesomeliquid in Celsius?" tools: - type: mcp server_label: "localmcp" @@ -94,3 +94,43 @@ test_response_multi_turn_image: output: "llama" - input: "What country do you find this animal primarily in? What continent?" output: "peru" + +test_response_multi_turn_tool_execution: + test_name: test_response_multi_turn_tool_execution + test_params: + case: + - case_id: "user_file_access_check" + input: "I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response." + tools: + - type: mcp + server_label: "localmcp" + server_url: "" + output: "yes" + - case_id: "experiment_results_lookup" + input: "I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me what you found." + tools: + - type: mcp + server_label: "localmcp" + server_url: "" + output: "100°C" + +test_response_multi_turn_tool_execution_streaming: + test_name: test_response_multi_turn_tool_execution_streaming + test_params: + case: + - case_id: "user_permissions_workflow" + input: "Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step." + tools: + - type: mcp + server_label: "localmcp" + server_url: "" + stream: true + output: "no" + - case_id: "experiment_analysis_streaming" + input: "I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Please stream your analysis process." 
+ tools: + - type: mcp + server_label: "localmcp" + server_url: "" + stream: true + output: "85%" diff --git a/tests/verifications/openai_api/test_responses.py b/tests/verifications/openai_api/test_responses.py index 2ce0a3e9c..28020d3b1 100644 --- a/tests/verifications/openai_api/test_responses.py +++ b/tests/verifications/openai_api/test_responses.py @@ -12,7 +12,7 @@ import pytest from llama_stack import LlamaStackAsLibraryClient from llama_stack.distribution.datatypes import AuthenticationRequiredError -from tests.common.mcp import make_mcp_server +from tests.common.mcp import dependency_tools, make_mcp_server from tests.verifications.openai_api.fixtures.fixtures import ( case_id_generator, get_base_test_name, @@ -280,6 +280,7 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider tools=tools, stream=False, ) + assert len(response.output) >= 3 list_tools = response.output[0] assert list_tools.type == "mcp_list_tools" @@ -290,11 +291,12 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider call = response.output[1] assert call.type == "mcp_call" assert call.name == "get_boiling_point" - assert json.loads(call.arguments) == {"liquid_name": "polyjuice", "celcius": True} + assert json.loads(call.arguments) == {"liquid_name": "myawesomeliquid", "celsius": True} assert call.error is None assert "-100" in call.output - message = response.output[2] + # sometimes the model will call the tool again, so we need to get the last message + message = response.output[-1] text_content = message.content[0].text assert "boiling point" in text_content.lower() @@ -393,3 +395,190 @@ def test_response_non_streaming_multi_turn_image(request, openai_client, model, previous_response_id = response.id output_text = response.output_text.lower() assert turn["output"].lower() in output_text + + +@pytest.mark.parametrize( + "case", + responses_test_cases["test_response_multi_turn_tool_execution"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_multi_turn_tool_execution( + request, openai_client, model, provider, verification_config, case +): + """Test multi-turn tool execution where multiple MCP tool calls are performed in sequence.""" + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + with make_mcp_server(tools=dependency_tools()) as mcp_server_info: + tools = case["tools"] + # Replace the placeholder URL with the actual server URL + for tool in tools: + if tool["type"] == "mcp" and tool["server_url"] == "": + tool["server_url"] = mcp_server_info["server_url"] + + response = openai_client.responses.create( + input=case["input"], + model=model, + tools=tools, + ) + + # Verify we have MCP tool calls in the output + mcp_list_tools = [output for output in response.output if output.type == "mcp_list_tools"] + mcp_calls = [output for output in response.output if output.type == "mcp_call"] + message_outputs = [output for output in response.output if output.type == "message"] + + # Should have exactly 1 MCP list tools message (at the beginning) + assert len(mcp_list_tools) == 1, f"Expected exactly 1 mcp_list_tools, got {len(mcp_list_tools)}" + assert mcp_list_tools[0].server_label == "localmcp" + assert len(mcp_list_tools[0].tools) == 5 # Updated for dependency tools + expected_tool_names = { + "get_user_id", + "get_user_permissions", + 
"check_file_access", + "get_experiment_id", + "get_experiment_results", + } + assert {t["name"] for t in mcp_list_tools[0].tools} == expected_tool_names + + assert len(mcp_calls) >= 1, f"Expected at least 1 mcp_call, got {len(mcp_calls)}" + for mcp_call in mcp_calls: + assert mcp_call.error is None, f"MCP call should not have errors, got: {mcp_call.error}" + + assert len(message_outputs) >= 1, f"Expected at least 1 message output, got {len(message_outputs)}" + + final_message = message_outputs[-1] + assert final_message.role == "assistant", f"Final message should be from assistant, got {final_message.role}" + assert final_message.status == "completed", f"Final message should be completed, got {final_message.status}" + assert len(final_message.content) > 0, "Final message should have content" + + expected_output = case["output"] + assert expected_output.lower() in response.output_text.lower(), ( + f"Expected '{expected_output}' to appear in response: {response.output_text}" + ) + + +@pytest.mark.parametrize( + "case", + responses_test_cases["test_response_multi_turn_tool_execution_streaming"]["test_params"]["case"], + ids=case_id_generator, +) +async def test_response_streaming_multi_turn_tool_execution( + request, openai_client, model, provider, verification_config, case +): + """Test streaming multi-turn tool execution where multiple MCP tool calls are performed in sequence.""" + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + with make_mcp_server(tools=dependency_tools()) as mcp_server_info: + tools = case["tools"] + # Replace the placeholder URL with the actual server URL + for tool in tools: + if tool["type"] == "mcp" and tool["server_url"] == "": + tool["server_url"] = mcp_server_info["server_url"] + + stream = openai_client.responses.create( + input=case["input"], + model=model, + tools=tools, + stream=True, + ) + + chunks = [] + async for chunk in stream: + chunks.append(chunk) + + # Should have at least response.created and response.completed + assert len(chunks) >= 2, f"Expected at least 2 chunks (created + completed), got {len(chunks)}" + + # First chunk should be response.created + assert chunks[0].type == "response.created", f"First chunk should be response.created, got {chunks[0].type}" + + # Last chunk should be response.completed + assert chunks[-1].type == "response.completed", ( + f"Last chunk should be response.completed, got {chunks[-1].type}" + ) + + # Get the final response from the last chunk + final_chunk = chunks[-1] + if hasattr(final_chunk, "response"): + final_response = final_chunk.response + + # Verify multi-turn MCP tool execution results + mcp_list_tools = [output for output in final_response.output if output.type == "mcp_list_tools"] + mcp_calls = [output for output in final_response.output if output.type == "mcp_call"] + message_outputs = [output for output in final_response.output if output.type == "message"] + + # Should have exactly 1 MCP list tools message (at the beginning) + assert len(mcp_list_tools) == 1, f"Expected exactly 1 mcp_list_tools, got {len(mcp_list_tools)}" + assert mcp_list_tools[0].server_label == "localmcp" + assert len(mcp_list_tools[0].tools) == 5 # Updated for dependency tools + expected_tool_names = { + "get_user_id", + "get_user_permissions", + "check_file_access", + "get_experiment_id", + "get_experiment_results", + } + assert {t["name"] for t in 
mcp_list_tools[0].tools} == expected_tool_names + + # Should have at least 1 MCP call (the model should call at least one tool) + assert len(mcp_calls) >= 1, f"Expected at least 1 mcp_call, got {len(mcp_calls)}" + + # All MCP calls should be completed (verifies our tool execution works) + for mcp_call in mcp_calls: + assert mcp_call.error is None, f"MCP call should not have errors, got: {mcp_call.error}" + + # Should have at least one final message response + assert len(message_outputs) >= 1, f"Expected at least 1 message output, got {len(message_outputs)}" + + # Final message should be from assistant and completed + final_message = message_outputs[-1] + assert final_message.role == "assistant", ( + f"Final message should be from assistant, got {final_message.role}" + ) + assert final_message.status == "completed", f"Final message should be completed, got {final_message.status}" + assert len(final_message.content) > 0, "Final message should have content" + + # Check that the expected output appears in the response + expected_output = case["output"] + assert expected_output.lower() in final_response.output_text.lower(), ( + f"Expected '{expected_output}' to appear in response: {final_response.output_text}" + ) + + +@pytest.mark.parametrize( + "text_format", + # Not testing json_object because most providers don't actually support it. + [ + {"type": "text"}, + { + "type": "json_schema", + "name": "capitals", + "description": "A schema for the capital of each country", + "schema": {"type": "object", "properties": {"capital": {"type": "string"}}}, + "strict": True, + }, + ], +) +def test_response_text_format(request, openai_client, model, provider, verification_config, text_format): + if isinstance(openai_client, LlamaStackAsLibraryClient): + pytest.skip("Responses API text format is not yet supported in library client.") + + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + stream = False + response = openai_client.responses.create( + model=model, + input="What is the capital of France?", + stream=stream, + text={"format": text_format}, + ) + # by_alias=True is needed because otherwise Pydantic renames our "schema" field + assert response.text.format.model_dump(exclude_none=True, by_alias=True) == text_format + assert "paris" in response.output_text.lower() + if text_format["type"] == "json_schema": + assert "paris" in json.loads(response.output_text)["capital"].lower() diff --git a/uv.lock b/uv.lock index dae04b5f6..a5dab16bf 100644 --- a/uv.lock +++ b/uv.lock @@ -1453,10 +1453,11 @@ sdist = { url = "https://files.pythonhosted.org/packages/65/c6/246100fa3967074d9 [[package]] name = "llama-stack" -version = "0.2.8" +version = "0.2.10" source = { editable = "." 
} dependencies = [ { name = "aiohttp" }, + { name = "fastapi" }, { name = "fire" }, { name = "h11" }, { name = "httpx" }, @@ -1470,6 +1471,7 @@ dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, { name = "python-jose" }, + { name = "python-multipart" }, { name = "requests" }, { name = "rich" }, { name = "setuptools" }, @@ -1494,7 +1496,6 @@ codegen = [ ] dev = [ { name = "black" }, - { name = "fastapi" }, { name = "nbval" }, { name = "pre-commit" }, { name = "pytest" }, @@ -1560,14 +1561,15 @@ unit = [ [package.metadata] requires-dist = [ { name = "aiohttp" }, + { name = "fastapi", specifier = ">=0.115.0,<1.0" }, { name = "fire" }, { name = "h11", specifier = ">=0.16.0" }, { name = "httpx" }, { name = "huggingface-hub" }, { name = "jinja2", specifier = ">=3.1.6" }, { name = "jsonschema" }, - { name = "llama-stack-client", specifier = ">=0.2.8" }, - { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.8" }, + { name = "llama-stack-client", specifier = ">=0.2.10" }, + { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.10" }, { name = "openai", specifier = ">=1.66" }, { name = "pandas", marker = "extra == 'ui'" }, { name = "pillow" }, @@ -1575,6 +1577,7 @@ requires-dist = [ { name = "pydantic", specifier = ">=2" }, { name = "python-dotenv" }, { name = "python-jose" }, + { name = "python-multipart", specifier = ">=0.0.20" }, { name = "requests" }, { name = "rich" }, { name = "setuptools" }, @@ -1594,7 +1597,6 @@ codegen = [ ] dev = [ { name = "black" }, - { name = "fastapi" }, { name = "nbval" }, { name = "pre-commit" }, { name = "pytest" }, @@ -1659,7 +1661,7 @@ unit = [ [[package]] name = "llama-stack-client" -version = "0.2.8" +version = "0.2.10" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1676,9 +1678,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/61/2e/c29ad9af892c0412b923b6fd5423fae7f1912444edb929c035b93f0540cd/llama_stack_client-0.2.8.tar.gz", hash = "sha256:40cc14ec9ad37969249d972abd681c925f3d4866fc4fa75a016ef1fe87cc7a40", size = 269661, upload-time = "2025-05-27T20:27:55.935Z" } +sdist = { url = "https://files.pythonhosted.org/packages/91/f3/d94c0c3d9d9af96daee4d70d79ad4f410a0873d0c0c65dd3f32c75a173ca/llama_stack_client-0.2.10.tar.gz", hash = "sha256:88f0941ab1a3e7600e02144e17ed0c99f2d5a206e197d7d5f46e6087451e13aa", size = 269672, upload-time = "2025-06-05T22:56:02.543Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/28/57/71ee1ad3a3e02dd1d52284a811c5b2caf72f4b704a9177bcf56e6114b56c/llama_stack_client-0.2.8-py3-none-any.whl", hash = "sha256:80b95d136b9a6e39a859578ad7d9536662c564d7f6483e45b010708608e4768d", size = 307589, upload-time = "2025-05-27T20:27:54.153Z" }, + { url = "https://files.pythonhosted.org/packages/98/3e/9ac50c51459a9c2ee15eb665091e8a7c41609d44ae450cea472df60b6f9b/llama_stack_client-0.2.10-py3-none-any.whl", hash = "sha256:bf5f5fe7015073720a9499f8066f97fc85aa4d21ec24140fbd44d062ad4972cd", size = 307601, upload-time = "2025-06-05T22:56:00.553Z" }, ] [[package]] @@ -2980,6 +2982,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/b0/2586ea6b6fd57a994ece0b56418cbe93fff0efb85e2c9eb6b0caf24a4e37/python_jose-3.4.0-py2.py3-none-any.whl", hash = "sha256:9c9f616819652d109bd889ecd1e15e9a162b9b94d682534c9c2146092945b78f", size = 34616, upload-time = "2025-02-18T17:26:40.826Z" }, ] +[[package]] +name = "python-multipart" +version = "0.0.20" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/87/f44d7c9f274c7ee665a29b885ec97089ec5dc034c7f3fafa03da9e39a09e/python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13", size = 37158, upload-time = "2024-12-16T19:45:46.972Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546, upload-time = "2024-12-16T19:45:44.423Z" }, +] + [[package]] name = "pytz" version = "2025.1"