diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 134efd93b..f88402a7a 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -9,6 +9,7 @@ updates: day: "saturday" commit-message: prefix: chore(github-deps) + - package-ecosystem: "uv" directory: "/" schedule: @@ -19,3 +20,14 @@ updates: - python commit-message: prefix: chore(python-deps) + + - package-ecosystem: npm + directory: "/llama_stack/ui" + schedule: + interval: "weekly" + day: "saturday" + labels: + - type/dependencies + - javascript + commit-message: + prefix: chore(ui-deps) diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml index e406d99ee..7a75d85f6 100644 --- a/.github/workflows/changelog.yml +++ b/.github/workflows/changelog.yml @@ -17,7 +17,7 @@ jobs: pull-requests: write # for peter-evans/create-pull-request to create a PR runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: ref: main fetch-depth: 0 diff --git a/.github/workflows/install-script-ci.yml b/.github/workflows/install-script-ci.yml index 1ecda6d51..a37919f56 100644 --- a/.github/workflows/install-script-ci.yml +++ b/.github/workflows/install-script-ci.yml @@ -16,14 +16,14 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0 - name: Run ShellCheck on install.sh run: shellcheck scripts/install.sh smoke-test-on-dev: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner diff --git a/.github/workflows/integration-auth-tests.yml b/.github/workflows/integration-auth-tests.yml index c328e3b6c..6787806e9 100644 --- a/.github/workflows/integration-auth-tests.yml +++ b/.github/workflows/integration-auth-tests.yml @@ -18,7 +18,7 @@ on: - '.github/workflows/integration-auth-tests.yml' # This workflow concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} cancel-in-progress: true jobs: @@ -31,7 +31,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner diff --git a/.github/workflows/integration-sql-store-tests.yml b/.github/workflows/integration-sql-store-tests.yml index 4e5b64963..3efd970e1 100644 --- a/.github/workflows/integration-sql-store-tests.yml +++ b/.github/workflows/integration-sql-store-tests.yml @@ -16,7 +16,7 @@ on: - '.github/workflows/integration-sql-store-tests.yml' # This workflow concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} cancel-in-progress: true jobs: @@ -44,7 +44,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner diff --git a/.github/workflows/integration-tests.yml 
b/.github/workflows/integration-tests.yml index ba18c27c8..57e582b20 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -65,7 +65,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup test environment uses: ./.github/actions/setup-test-environment diff --git a/.github/workflows/integration-vector-io-tests.yml b/.github/workflows/integration-vector-io-tests.yml index 61b8e004e..de5701073 100644 --- a/.github/workflows/integration-vector-io-tests.yml +++ b/.github/workflows/integration-vector-io-tests.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 4f1c143d2..2825c3bf4 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -8,7 +8,7 @@ on: branches: [main] concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} cancel-in-progress: true jobs: @@ -20,7 +20,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: # For dependabot PRs, we need to checkout with a token that can push changes token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }} @@ -36,6 +36,17 @@ jobs: **/requirements*.txt .pre-commit-config.yaml + - name: Set up Node.js + uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: 'llama_stack/ui/' + + - name: Install npm dependencies + run: npm ci + working-directory: llama_stack/ui + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 continue-on-error: true env: diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml index 929d76760..391acbcf8 100644 --- a/.github/workflows/providers-build.yml +++ b/.github/workflows/providers-build.yml @@ -26,7 +26,7 @@ on: - 'pyproject.toml' concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} cancel-in-progress: true jobs: @@ -36,7 +36,7 @@ jobs: distros: ${{ steps.set-matrix.outputs.distros }} steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Generate Distribution List id: set-matrix @@ -55,7 +55,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner @@ -79,7 +79,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: 
./.github/actions/setup-runner @@ -92,7 +92,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner @@ -106,6 +106,10 @@ jobs: - name: Inspect the container image entrypoint run: | IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1) + if [ -z "$IMAGE_ID" ]; then + echo "No image found" + exit 1 + fi entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID) echo "Entrypoint: $entrypoint" if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then @@ -117,7 +121,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner @@ -140,6 +144,10 @@ jobs: - name: Inspect UBI9 image run: | IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1) + if [ -z "$IMAGE_ID" ]; then + echo "No image found" + exit 1 + fi entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID) echo "Entrypoint: $entrypoint" if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then diff --git a/.github/workflows/python-build-test.yml b/.github/workflows/python-build-test.yml index fe1dfd58a..9de53f7fb 100644 --- a/.github/workflows/python-build-test.yml +++ b/.github/workflows/python-build-test.yml @@ -21,10 +21,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install uv - uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3 + uses: astral-sh/setup-uv@d9e0f98d3fc6adb07d1e3d37f3043649ddad06a1 # v6.5.0 with: python-version: ${{ matrix.python-version }} activate-environment: true diff --git a/.github/workflows/record-integration-tests.yml b/.github/workflows/record-integration-tests.yml index 22636f209..d4f5586e2 100644 --- a/.github/workflows/record-integration-tests.yml +++ b/.github/workflows/record-integration-tests.yml @@ -46,7 +46,7 @@ jobs: echo "::endgroup::" - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: fetch-depth: 0 diff --git a/.github/workflows/semantic-pr.yml b/.github/workflows/semantic-pr.yml index 57a4df646..4adaca84d 100644 --- a/.github/workflows/semantic-pr.yml +++ b/.github/workflows/semantic-pr.yml @@ -22,6 +22,6 @@ jobs: runs-on: ubuntu-latest steps: - name: Check PR Title's semantic conformance - uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3 + uses: amannn/action-semantic-pull-request@7f33ba792281b034f64e96f4c0b5496782dd3b37 # v6.1.0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/test-external-provider-module.yml b/.github/workflows/test-external-provider-module.yml index d61b0dfe9..8a757b068 100644 --- a/.github/workflows/test-external-provider-module.yml +++ b/.github/workflows/test-external-provider-module.yml @@ -27,7 +27,7 @@ jobs: # container and point 'uv pip install' to the correct path... 
steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner diff --git a/.github/workflows/test-external.yml b/.github/workflows/test-external.yml index b9db0ad51..7ee467451 100644 --- a/.github/workflows/test-external.yml +++ b/.github/workflows/test-external.yml @@ -27,7 +27,7 @@ jobs: # container and point 'uv pip install' to the correct path... steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner diff --git a/.github/workflows/ui-unit-tests.yml b/.github/workflows/ui-unit-tests.yml index 00c539c58..2afb92bee 100644 --- a/.github/workflows/ui-unit-tests.yml +++ b/.github/workflows/ui-unit-tests.yml @@ -13,7 +13,7 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} cancel-in-progress: true jobs: @@ -26,10 +26,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Node.js - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 + uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 with: node-version: ${{ matrix.node-version }} cache: 'npm' diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index f2a6c7754..dd2097a45 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -18,7 +18,7 @@ on: workflow_dispatch: concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} cancel-in-progress: true jobs: @@ -32,7 +32,7 @@ jobs: - "3.13" steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner diff --git a/.github/workflows/update-readthedocs.yml b/.github/workflows/update-readthedocs.yml index 1dcfdeca5..e12f0adf8 100644 --- a/.github/workflows/update-readthedocs.yml +++ b/.github/workflows/update-readthedocs.yml @@ -27,7 +27,7 @@ on: - '.github/workflows/update-readthedocs.yml' concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} cancel-in-progress: true jobs: @@ -37,7 +37,7 @@ jobs: TOKEN: ${{ secrets.READTHEDOCS_TOKEN }} steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies uses: ./.github/actions/setup-runner diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4309f289a..514fe6d2e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -146,20 +146,32 @@ repos: pass_filenames: false require_serial: true files: ^.github/workflows/.*$ - - id: ui-prettier - name: Format UI code with Prettier - entry: bash -c 'cd llama_stack/ui && 
npm run format' + - id: ui-linter + name: Format & Lint UI + entry: bash ./scripts/run-ui-linter.sh language: system files: ^llama_stack/ui/.*\.(ts|tsx)$ pass_filenames: false require_serial: true - - id: ui-eslint - name: Lint UI code with ESLint - entry: bash -c 'cd llama_stack/ui && npm run lint -- --fix --quiet' + + - id: check-log-usage + name: Ensure 'llama_stack.log' usage for logging + entry: bash language: system - files: ^llama_stack/ui/.*\.(ts|tsx)$ - pass_filenames: false - require_serial: true + types: [python] + pass_filenames: true + args: + - -c + - | + matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true) + if [ -n "$matches" ]; then + # GitHub Actions annotation format + while IFS=: read -r file line_num rest; do + echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging" + done <<< "$matches" + exit 1 + fi + exit 0 + - _ # placeholder for $0: bash -c consumes the next argument as $0, so without this the first filename pre-commit passes would be silently skipped by "$@" ci: autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b36626719..a1f6a6f30 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4605,6 +4605,49 @@ } } }, + "/v1/inference/rerank": { + "post": { + "responses": { + "200": { + "description": "RerankResponse with indices sorted by relevance score (descending).", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RerankResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "Rerank a list of documents based on their relevance to a query.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RerankRequest" + } + } + }, + "required": true + } + } + }, "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": { "post": { "responses": { @@ -16024,12 +16067,16 @@ "value": { "type": "number", "description": "The numeric value of the metric at this timestamp" + }, + "unit": { + "type": "string" } }, "additionalProperties": false, "required": [ "timestamp", - "value" + "value", + "unit" ], "title": "MetricDataPoint", "description": "A single data point in a metric time series." @@ -16587,6 +16634,95 @@ ], "title": "RegisterVectorDbRequest" }, + "RerankRequest": { + "type": "object", + "properties": { + "model": { + "type": "string", + "description": "The identifier of the reranking model to use." + }, + "query": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + ], + "description": "The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length."
+ }, + "items": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "string" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + ] + }, + "description": "List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length." + }, + "max_num_results": { + "type": "integer", + "description": "(Optional) Maximum number of results to return. Default: returns all." + } + }, + "additionalProperties": false, + "required": [ + "model", + "query", + "items" + ], + "title": "RerankRequest" + }, + "RerankData": { + "type": "object", + "properties": { + "index": { + "type": "integer", + "description": "The original index of the document in the input list" + }, + "relevance_score": { + "type": "number", + "description": "The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance." + } + }, + "additionalProperties": false, + "required": [ + "index", + "relevance_score" + ], + "title": "RerankData", + "description": "A single rerank result from a reranking response." + }, + "RerankResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RerankData" + }, + "description": "List of rerank result objects, sorted by relevance score (descending)" + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "RerankResponse", + "description": "Response from a reranking request." + }, "ResumeAgentTurnRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index e7733b3c3..33142e3ff 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -3264,6 +3264,37 @@ paths: schema: $ref: '#/components/schemas/QueryTracesRequest' required: true + /v1/inference/rerank: + post: + responses: + '200': + description: >- + RerankResponse with indices sorted by relevance score (descending). + content: + application/json: + schema: + $ref: '#/components/schemas/RerankResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: >- + Rerank a list of documents based on their relevance to a query. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RerankRequest' + required: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume: post: responses: @@ -11923,10 +11954,13 @@ components: type: number description: >- The numeric value of the metric at this timestamp + unit: + type: string additionalProperties: false required: - timestamp - value + - unit title: MetricDataPoint description: >- A single data point in a metric time series. @@ -12337,6 +12371,76 @@ components: - vector_db_id - embedding_model title: RegisterVectorDbRequest + RerankRequest: + type: object + properties: + model: + type: string + description: >- + The identifier of the reranking model to use. 
+ query: + oneOf: + - type: string + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + description: >- + The search query to rank items against. Can be a string, text content + part, or image content part. The input must not exceed the model's max + input token length. + items: + type: array + items: + oneOf: + - type: string + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + description: >- + List of items to rerank. Each item can be a string, text content part, + or image content part. Each input must not exceed the model's max input + token length. + max_num_results: + type: integer + description: >- + (Optional) Maximum number of results to return. Default: returns all. + additionalProperties: false + required: + - model + - query + - items + title: RerankRequest + RerankData: + type: object + properties: + index: + type: integer + description: >- + The original index of the document in the input list + relevance_score: + type: number + description: >- + The relevance score from the model output. Values are inverted when applicable + so that higher scores indicate greater relevance. + additionalProperties: false + required: + - index + - relevance_score + title: RerankData + description: >- + A single rerank result from a reranking response. + RerankResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/RerankData' + description: >- + List of rerank result objects, sorted by relevance score (descending) + additionalProperties: false + required: + - data + title: RerankResponse + description: Response from a reranking request. ResumeAgentTurnRequest: type: object properties: diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md index 335fa3a68..c9677b3b6 100644 --- a/docs/source/distributions/configuration.md +++ b/docs/source/distributions/configuration.md @@ -225,8 +225,32 @@ server: port: 8321 # Port to listen on (default: 8321) tls_certfile: "/path/to/cert.pem" # Optional: Path to TLS certificate for HTTPS tls_keyfile: "/path/to/key.pem" # Optional: Path to TLS key for HTTPS + cors: true # Optional: Enable CORS (dev mode) or full config object ``` +### CORS Configuration + +CORS (Cross-Origin Resource Sharing) can be configured in two ways: + +**Local development** (allows localhost origins only): +```yaml +server: + cors: true +``` + +**Explicit configuration** (custom origins and settings): +```yaml +server: + cors: + allow_origins: ["https://myapp.com", "https://app.example.com"] + allow_methods: ["GET", "POST", "PUT", "DELETE"] + allow_headers: ["Content-Type", "Authorization"] + allow_credentials: true + max_age: 3600 +``` + +When `cors: true`, the server enables secure localhost-only access for local development. For production, specify exact origins to maintain security. + ### Authentication Configuration > **Breaking Change (v0.2.14)**: The authentication configuration structure has changed. The previous format with `provider_type` and `config` fields has been replaced with a unified `provider_config` field that includes the `type` field. Update your configuration files accordingly. @@ -618,6 +642,54 @@ Content-Type: application/json } ``` +### CORS Configuration + +Configure CORS to allow web browsers to make requests from different domains. Disabled by default. 
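To make the `cors: true` shorthand concrete, here is a minimal standalone sketch of the dev-mode expansion performed by the `process_cors_config` helper this PR adds in `llama_stack/core/datatypes.py`. It is simplified to plain dicts; the real helper returns a validated `CORSConfig` model, and the function name here is illustrative only:

```python
# Sketch of the dev-mode CORS expansion added in this PR (plain-dict version).
def expand_cors_setting(cors: bool | dict | None) -> dict | None:
    if cors is False or cors is None:
        return None  # CORS middleware is never installed
    if cors is True:
        # dev mode: allow localhost on any port, over http or https
        return {
            "allow_origins": [],
            "allow_origin_regex": r"https?://localhost:\d+",
            "allow_methods": ["GET", "POST", "PUT", "DELETE", "OPTIONS"],
            "allow_headers": ["Content-Type", "Authorization", "X-Requested-With"],
        }
    return cors  # explicit mappings pass through to CORSMiddleware unchanged


print(expand_cors_setting(True)["allow_origin_regex"])  # https?://localhost:\d+
```

The server then feeds the result into FastAPI's `CORSMiddleware` via `app.add_middleware(CORSMiddleware, **cors_config.model_dump())`, as shown in the `server.py` hunk later in this diff.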
+ +#### Quick Setup + +For development, use the simple boolean flag: + +```yaml +server: + cors: true # Auto-enables localhost with any port +``` + +This automatically allows `http://localhost:*` and `https://localhost:*` with secure defaults. + +#### Custom Configuration + +For specific origins and full control: + +```yaml +server: + cors: + allow_origins: ["https://myapp.com", "https://staging.myapp.com"] + allow_credentials: true + allow_methods: ["GET", "POST", "PUT", "DELETE"] + allow_headers: ["Content-Type", "Authorization"] + allow_origin_regex: "https://.*\\.example\\.com" # Optional regex pattern + expose_headers: ["X-Total-Count"] + max_age: 86400 +``` + +#### Configuration Options + +| Field | Description | Default | +| -------------------- | ---------------------------------------------- | ------- | +| `allow_origins` | List of allowed origins. Use `["*"]` for any. | `["*"]` | +| `allow_origin_regex` | Regex pattern for allowed origins (optional). | `None` | +| `allow_methods` | Allowed HTTP methods. | `["*"]` | +| `allow_headers` | Allowed headers. | `["*"]` | +| `allow_credentials` | Allow credentials (cookies, auth headers). | `false` | +| `expose_headers` | Headers exposed to browser. | `[]` | +| `max_age` | Preflight cache time (seconds). | `600` | + +**Security Notes**: +- `allow_credentials: true` requires explicit origins (no wildcards) +- `cors: true` enables localhost access only (secure for development) +- For public APIs, always specify exact allowed origins + ## Extending to handle Safety Configuring Safety can be a little involved so it is instructive to go through an example. diff --git a/docs/source/distributions/importing_as_library.md b/docs/source/distributions/importing_as_library.md index fbc48dd95..b9b4b065a 100644 --- a/docs/source/distributions/importing_as_library.md +++ b/docs/source/distributions/importing_as_library.md @@ -17,7 +17,6 @@ client = LlamaStackAsLibraryClient( # provider_data is optional, but if you need to pass in any provider specific data, you can do so here. provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]}, ) -client.initialize() ``` This will parse your config and set up any inline implementations and remote clients needed for your implementation. @@ -32,5 +31,4 @@ If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/ ```python client = LlamaStackAsLibraryClient(config_path) -client.initialize() ``` diff --git a/docs/source/distributions/k8s-benchmark/benchmark.py b/docs/source/distributions/k8s-benchmark/benchmark.py index 0e7368431..3d0d18150 100644 --- a/docs/source/distributions/k8s-benchmark/benchmark.py +++ b/docs/source/distributions/k8s-benchmark/benchmark.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # diff --git a/docs/source/providers/batches/index.md b/docs/source/providers/batches/index.md index 2a39a626c..d6d2fa9a3 100644 --- a/docs/source/providers/batches/index.md +++ b/docs/source/providers/batches/index.md @@ -2,12 +2,15 @@ ## Overview -Protocol for batch processing API operations. - - The Batches API enables efficient processing of multiple requests in a single operation, +The Batches API enables efficient processing of multiple requests in a single operation, particularly useful for processing large datasets, batch evaluation workflows, and cost-effective inference at scale. + The API is designed to allow use of openai client libraries for seamless integration. 
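Since the overview above emphasizes OpenAI client compatibility, a hedged usage sketch may help. Everything below is illustrative: it assumes a Llama Stack server on `localhost:8321` serving OpenAI-compatible routes under `/openai/v1`, a previously uploaded input file (the `file-abc123` id is made up), and that the new `idempotency_key` parameter can be forwarded via `extra_body` — none of which is confirmed by this diff beyond the parameter itself:

```python
# Hypothetical sketch: driving the Batches API with the stock `openai` client.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")

batch = client.batches.create(
    input_file_id="file-abc123",  # hypothetical; must reference an uploaded file
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"job": "nightly-eval"},
    # idempotency_key is the extension added in this PR; sending it through
    # extra_body is an assumption about how a stock client would surface it.
    extra_body={"idempotency_key": "nightly-eval-2025-01-01"},
)
print(batch.id, batch.status)
```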
+ + This API provides the following extensions: + - idempotent batch creation + Note: This API is currently under active development and may undergo changes. This section contains documentation for all available providers for the **batches** API. diff --git a/docs/source/providers/files/index.md b/docs/source/providers/files/index.md index 692aad3ca..128953223 100644 --- a/docs/source/providers/files/index.md +++ b/docs/source/providers/files/index.md @@ -10,4 +10,5 @@ This section contains documentation for all available providers for the **files* :maxdepth: 1 inline_localfs +remote_s3 ``` diff --git a/docs/source/providers/files/remote_s3.md b/docs/source/providers/files/remote_s3.md new file mode 100644 index 000000000..2e3cebabd --- /dev/null +++ b/docs/source/providers/files/remote_s3.md @@ -0,0 +1,33 @@ +# remote::s3 + +## Description + +AWS S3-based file storage provider for scalable cloud file management with metadata persistence. + +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `bucket_name` | `` | No | | S3 bucket name to store files | +| `region` | `` | No | us-east-1 | AWS region where the bucket is located | +| `aws_access_key_id` | `str \| None` | No | | AWS access key ID (optional if using IAM roles) | +| `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) | +| `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) | +| `auto_create_bucket` | `` | No | False | Automatically create the S3 bucket if it doesn't exist | +| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata | + +## Sample Configuration + +```yaml +bucket_name: ${env.S3_BUCKET_NAME} +region: ${env.AWS_REGION:=us-east-1} +aws_access_key_id: ${env.AWS_ACCESS_KEY_ID:=} +aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=} +endpoint_url: ${env.S3_ENDPOINT_URL:=} +auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db + +``` + diff --git a/docs/source/providers/post_training/index.md b/docs/source/providers/post_training/index.md index c6c92c40e..5ada6f9aa 100644 --- a/docs/source/providers/post_training/index.md +++ b/docs/source/providers/post_training/index.md @@ -9,7 +9,9 @@ This section contains documentation for all available providers for the **post_t ```{toctree} :maxdepth: 1 -inline_huggingface -inline_torchtune +inline_huggingface-cpu +inline_huggingface-gpu +inline_torchtune-cpu +inline_torchtune-gpu remote_nvidia ``` diff --git a/docs/source/providers/post_training/inline_huggingface-cpu.md b/docs/source/providers/post_training/inline_huggingface-cpu.md new file mode 100644 index 000000000..e663fe8f8 --- /dev/null +++ b/docs/source/providers/post_training/inline_huggingface-cpu.md @@ -0,0 +1,41 @@ +# inline::huggingface-cpu + +## Description + +HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem. 
+ +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `device` | `` | No | cuda | | +| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No | | | +| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface | | +| `chat_template` | `` | No | <|user|> +{input} +<|assistant|> +{output} | | +| `model_specific_config` | `` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | | +| `max_seq_length` | `` | No | 2048 | | +| `gradient_checkpointing` | `` | No | False | | +| `save_total_limit` | `` | No | 3 | | +| `logging_steps` | `` | No | 10 | | +| `warmup_ratio` | `` | No | 0.1 | | +| `weight_decay` | `` | No | 0.01 | | +| `dataloader_num_workers` | `` | No | 4 | | +| `dataloader_pin_memory` | `` | No | True | | +| `dpo_beta` | `` | No | 0.1 | | +| `use_reference_model` | `` | No | True | | +| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair']` | No | sigmoid | | +| `dpo_output_dir` | `` | No | | | + +## Sample Configuration + +```yaml +checkpoint_format: huggingface +distributed_backend: null +device: cpu +dpo_output_dir: ~/.llama/dummy/dpo_output + +``` + diff --git a/docs/source/providers/post_training/inline_huggingface-gpu.md b/docs/source/providers/post_training/inline_huggingface-gpu.md new file mode 100644 index 000000000..21bf965fe --- /dev/null +++ b/docs/source/providers/post_training/inline_huggingface-gpu.md @@ -0,0 +1,41 @@ +# inline::huggingface-gpu + +## Description + +HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem. + +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `device` | `` | No | cuda | | +| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No | | | +| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface | | +| `chat_template` | `` | No | <|user|> +{input} +<|assistant|> +{output} | | +| `model_specific_config` | `` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | | +| `max_seq_length` | `` | No | 2048 | | +| `gradient_checkpointing` | `` | No | False | | +| `save_total_limit` | `` | No | 3 | | +| `logging_steps` | `` | No | 10 | | +| `warmup_ratio` | `` | No | 0.1 | | +| `weight_decay` | `` | No | 0.01 | | +| `dataloader_num_workers` | `` | No | 4 | | +| `dataloader_pin_memory` | `` | No | True | | +| `dpo_beta` | `` | No | 0.1 | | +| `use_reference_model` | `` | No | True | | +| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair']` | No | sigmoid | | +| `dpo_output_dir` | `` | No | | | + +## Sample Configuration + +```yaml +checkpoint_format: huggingface +distributed_backend: null +device: cpu +dpo_output_dir: ~/.llama/dummy/dpo_output + +``` + diff --git a/docs/source/providers/post_training/inline_torchtune-cpu.md b/docs/source/providers/post_training/inline_torchtune-cpu.md new file mode 100644 index 000000000..7204e56e8 --- /dev/null +++ b/docs/source/providers/post_training/inline_torchtune-cpu.md @@ -0,0 +1,20 @@ +# inline::torchtune-cpu + +## Description + +TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
+ +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `torch_seed` | `int \| None` | No | | | +| `checkpoint_format` | `Literal['meta', 'huggingface']` | No | meta | | + +## Sample Configuration + +```yaml +checkpoint_format: meta + +``` + diff --git a/docs/source/providers/post_training/inline_torchtune-gpu.md b/docs/source/providers/post_training/inline_torchtune-gpu.md new file mode 100644 index 000000000..98b94f6f6 --- /dev/null +++ b/docs/source/providers/post_training/inline_torchtune-gpu.md @@ -0,0 +1,20 @@ +# inline::torchtune-gpu + +## Description + +TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework. + +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `torch_seed` | `int \| None` | No | | | +| `checkpoint_format` | `Literal['meta', 'huggingface']` | No | meta | | + +## Sample Configuration + +```yaml +checkpoint_format: meta + +``` + diff --git a/llama_stack/apis/batches/batches.py b/llama_stack/apis/batches/batches.py index 9297d8597..c6bbd92eb 100644 --- a/llama_stack/apis/batches/batches.py +++ b/llama_stack/apis/batches/batches.py @@ -29,12 +29,16 @@ class ListBatchesResponse(BaseModel): @runtime_checkable class Batches(Protocol): - """Protocol for batch processing API operations. - + """ The Batches API enables efficient processing of multiple requests in a single operation, particularly useful for processing large datasets, batch evaluation workflows, and cost-effective inference at scale. + The API is designed to allow use of openai client libraries for seamless integration. + + This API provides the following extensions: + - idempotent batch creation + Note: This API is currently under active development and may undergo changes. """ @@ -45,6 +49,7 @@ class Batches(Protocol): endpoint: str, completion_window: Literal["24h"], metadata: dict[str, str] | None = None, + idempotency_key: str | None = None, ) -> BatchObject: """Create a new batch for processing multiple API requests. @@ -52,6 +57,7 @@ :param endpoint: The endpoint to be used for all requests in the batch. :param completion_window: The time window within which the batch should be processed. :param metadata: Optional metadata for the batch. + :param idempotency_key: Optional idempotency key. When provided, enables idempotent behavior. :returns: The created batch object. """ ... diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 7e7bd0a3d..bd4737ca7 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -473,6 +473,28 @@ class EmbeddingsResponse(BaseModel): embeddings: list[list[float]] +@json_schema_type +class RerankData(BaseModel): + """A single rerank result from a reranking response. + + :param index: The original index of the document in the input list + :param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance. + """ + + index: int + relevance_score: float + + +@json_schema_type +class RerankResponse(BaseModel): + """Response from a reranking request.
+ + :param data: List of rerank result objects, sorted by relevance score (descending) + """ + + data: list[RerankData] + + @json_schema_type class OpenAIChatCompletionContentPartTextParam(BaseModel): """Text content part for OpenAI-compatible chat completion messages. @@ -1046,6 +1068,7 @@ class InferenceProvider(Protocol): :returns: A BatchCompletionResponse with the full completions. """ raise NotImplementedError("Batch completion is not implemented") + return # this is so mypy's safe-super rule will consider the method concrete @webmethod(route="/inference/chat-completion", method="POST") async def chat_completion( @@ -1110,6 +1133,7 @@ class InferenceProvider(Protocol): :returns: A BatchChatCompletionResponse with the full completions. """ raise NotImplementedError("Batch chat completion is not implemented") + return # this is so mypy's safe-super rule will consider the method concrete @webmethod(route="/inference/embeddings", method="POST") async def embeddings( @@ -1131,6 +1155,25 @@ class InferenceProvider(Protocol): """ ... + @webmethod(route="/inference/rerank", method="POST", experimental=True) + async def rerank( + self, + model: str, + query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam, + items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam], + max_num_results: int | None = None, + ) -> RerankResponse: + """Rerank a list of documents based on their relevance to a query. + + :param model: The identifier of the reranking model to use. + :param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length. + :param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length. + :param max_num_results: (Optional) Maximum number of results to return. Default: returns all. + :returns: RerankResponse with indices sorted by relevance score (descending). 
+ """ + raise NotImplementedError("Reranking is not implemented") + return # this is so mypy's safe-super rule will consider the method concrete + @webmethod(route="/openai/v1/completions", method="POST") async def openai_completion( self, diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 92422ac1b..8d1b5d697 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -386,6 +386,7 @@ class MetricDataPoint(BaseModel): timestamp: int value: float + unit: str @json_schema_type @@ -518,7 +519,7 @@ class Telemetry(Protocol): metric_name: str, start_time: int, end_time: int | None = None, - granularity: str | None = "1d", + granularity: str | None = None, query_type: MetricQueryType = MetricQueryType.RANGE, label_matchers: list[MetricLabelMatcher] | None = None, ) -> QueryMetricsResponse: diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index c8ffce034..b32b8b3ae 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -15,7 +15,7 @@ from llama_stack.log import get_logger REPO_ROOT = Path(__file__).parent.parent.parent.parent -logger = get_logger(name=__name__, category="server") +logger = get_logger(name=__name__, category="cli") class StackRun(Subcommand): diff --git a/llama_stack/core/build.py b/llama_stack/core/build.py index 4b20588fd..fa1fe632b 100644 --- a/llama_stack/core/build.py +++ b/llama_stack/core/build.py @@ -5,7 +5,6 @@ # the root directory of this source tree. import importlib.resources -import logging import sys from pydantic import BaseModel @@ -17,9 +16,10 @@ from llama_stack.core.external import load_external_apis from llama_stack.core.utils.exec import run_command from llama_stack.core.utils.image_types import LlamaStackImageType from llama_stack.distributions.template import DistributionTemplate +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="core") # These are the dependencies needed by the distribution server. # `llama-stack` is automatically installed by the installation script. diff --git a/llama_stack/core/configure.py b/llama_stack/core/configure.py index 9e18b438c..64473c053 100644 --- a/llama_stack/core/configure.py +++ b/llama_stack/core/configure.py @@ -3,7 +3,6 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
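Stepping back to the `rerank` method added to `inference.py` above: it is experimental and unimplemented by default (`raise NotImplementedError`), but the request and response schema is fixed by this PR, so an illustrative client-side sketch is possible. Assumptions: a server on `localhost:8321` and a provider that registers a reranking model; the `example/reranker` identifier is made up:

```python
# Illustrative call to the new experimental /v1/inference/rerank route.
import requests

resp = requests.post(
    "http://localhost:8321/v1/inference/rerank",
    json={
        "model": "example/reranker",  # hypothetical model identifier
        "query": "What is the capital of France?",
        "items": [
            "Paris is the capital of France.",
            "Berlin is the capital of Germany.",
            "France is famous for its cuisine.",
        ],
        "max_num_results": 2,  # optional; omit to get every item back
    },
    timeout=30,
)
resp.raise_for_status()
# RerankResponse.data arrives sorted by relevance_score, descending.
for result in resp.json()["data"]:
    print(result["index"], result["relevance_score"])
```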
-import logging import textwrap from typing import Any @@ -21,9 +20,10 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR from llama_stack.core.utils.dynamic import instantiate_class_type from llama_stack.core.utils.prompt_for_config import prompt_for_config +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api, ProviderSpec -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="core") def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider: diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py index a1b6ad32b..c3940fcbd 100644 --- a/llama_stack/core/datatypes.py +++ b/llama_stack/core/datatypes.py @@ -318,6 +318,41 @@ class QuotaConfig(BaseModel): period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set") +class CORSConfig(BaseModel): + allow_origins: list[str] = Field(default_factory=list) + allow_origin_regex: str | None = Field(default=None) + allow_methods: list[str] = Field(default=["OPTIONS"]) + allow_headers: list[str] = Field(default_factory=list) + allow_credentials: bool = Field(default=False) + expose_headers: list[str] = Field(default_factory=list) + max_age: int = Field(default=600, ge=0) + + @model_validator(mode="after") + def validate_credentials_config(self) -> Self: + if self.allow_credentials and (self.allow_origins == ["*"] or "*" in self.allow_origins): + raise ValueError("Cannot use wildcard origins with credentials enabled") + return self + + +def process_cors_config(cors_config: bool | CORSConfig | None) -> CORSConfig | None: + if cors_config is False or cors_config is None: + return None + + if cors_config is True: + # dev mode: allow localhost on any port + return CORSConfig( + allow_origins=[], + allow_origin_regex=r"https?://localhost:\d+", + allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"], + allow_headers=["Content-Type", "Authorization", "X-Requested-With"], + ) + + if isinstance(cors_config, CORSConfig): + return cors_config + + raise ValueError(f"Expected bool or CORSConfig, got {type(cors_config).__name__}") + + class ServerConfig(BaseModel): port: int = Field( default=8321, @@ -349,6 +384,12 @@ class ServerConfig(BaseModel): default=None, description="Per client quota request configuration", ) + cors: bool | CORSConfig | None = Field( + default=None, + description="CORS configuration for cross-origin requests. 
Can be:\n" + "- true: Enable localhost CORS for development\n" + "- {allow_origins: [...], allow_methods: [...], ...}: Full configuration", + ) class StackRunConfig(BaseModel): diff --git a/llama_stack/core/library_client.py b/llama_stack/core/library_client.py index a93fe509e..9e7a8006c 100644 --- a/llama_stack/core/library_client.py +++ b/llama_stack/core/library_client.py @@ -7,7 +7,7 @@ import asyncio import inspect import json -import logging +import logging # allow-direct-logging import os import sys from concurrent.futures import ThreadPoolExecutor @@ -48,6 +48,7 @@ from llama_stack.core.stack import ( from llama_stack.core.utils.config import redact_sensitive_fields from llama_stack.core.utils.context import preserve_contexts_async_generator from llama_stack.core.utils.exec import in_notebook +from llama_stack.log import get_logger from llama_stack.providers.utils.telemetry.tracing import ( CURRENT_TRACE_CONTEXT, end_trace, @@ -55,7 +56,7 @@ from llama_stack.providers.utils.telemetry.tracing import ( start_trace, ) -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="core") T = TypeVar("T") @@ -145,39 +146,26 @@ class LlamaStackAsLibraryClient(LlamaStackClient): ): super().__init__() self.async_client = AsyncLlamaStackAsLibraryClient( - config_path_or_distro_name, custom_provider_registry, provider_data + config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal ) self.pool_executor = ThreadPoolExecutor(max_workers=4) - self.skip_logger_removal = skip_logger_removal self.provider_data = provider_data self.loop = asyncio.new_event_loop() - def initialize(self): - if in_notebook(): - import nest_asyncio - - nest_asyncio.apply() - if not self.skip_logger_removal: - self._remove_root_logger_handlers() - # use a new event loop to avoid interfering with the main event loop loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: - return loop.run_until_complete(self.async_client.initialize()) + loop.run_until_complete(self.async_client.initialize()) finally: asyncio.set_event_loop(None) - def _remove_root_logger_handlers(self): + def initialize(self): """ - Remove all handlers from the root logger. Needed to avoid polluting the console with logs. + Deprecated method for backward compatibility. 
""" - root_logger = logging.getLogger() - - for handler in root_logger.handlers[:]: - root_logger.removeHandler(handler) - logger.info(f"Removed handler {handler.__class__.__name__} from root logger") + pass def request(self, *args, **kwargs): loop = self.loop @@ -215,6 +203,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): config_path_or_distro_name: str, custom_provider_registry: ProviderRegistry | None = None, provider_data: dict[str, Any] | None = None, + skip_logger_removal: bool = False, ): super().__init__() # when using the library client, we should not log to console since many @@ -222,6 +211,13 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): current_sinks = os.environ.get("TELEMETRY_SINKS", "sqlite").split(",") os.environ["TELEMETRY_SINKS"] = ",".join(sink for sink in current_sinks if sink != "console") + if in_notebook(): + import nest_asyncio + + nest_asyncio.apply() + if not skip_logger_removal: + self._remove_root_logger_handlers() + if config_path_or_distro_name.endswith(".yaml"): config_path = Path(config_path_or_distro_name) if not config_path.exists(): @@ -238,7 +234,24 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): self.provider_data = provider_data self.route_impls: RouteImpls | None = None # Initialize to None to prevent AttributeError + def _remove_root_logger_handlers(self): + """ + Remove all handlers from the root logger. Needed to avoid polluting the console with logs. + """ + root_logger = logging.getLogger() + + for handler in root_logger.handlers[:]: + root_logger.removeHandler(handler) + logger.info(f"Removed handler {handler.__class__.__name__} from root logger") + async def initialize(self) -> bool: + """ + Initialize the async client. + + Returns: + bool: True if initialization was successful + """ + try: self.route_impls = None self.impls = await construct_stack(self.config, self.custom_provider_registry) diff --git a/llama_stack/core/request_headers.py b/llama_stack/core/request_headers.py index 35ac72775..f1ce8281f 100644 --- a/llama_stack/core/request_headers.py +++ b/llama_stack/core/request_headers.py @@ -6,15 +6,15 @@ import contextvars import json -import logging from contextlib import AbstractContextManager from typing import Any from llama_stack.core.datatypes import User +from llama_stack.log import get_logger from .utils.dynamic import instantiate_class_type -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="core") # Context variable for request provider data and auth attributes PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None) diff --git a/llama_stack/core/routers/datasets.py b/llama_stack/core/routers/datasets.py index d7984f729..2f1d5f78e 100644 --- a/llama_stack/core/routers/datasets.py +++ b/llama_stack/core/routers/datasets.py @@ -12,7 +12,7 @@ from llama_stack.apis.datasets import DatasetPurpose, DataSource from llama_stack.log import get_logger from llama_stack.providers.datatypes import RoutingTable -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routers") class DatasetIORouter(DatasetIO): diff --git a/llama_stack/core/routers/eval_scoring.py b/llama_stack/core/routers/eval_scoring.py index f7a17eecf..ffca81bf0 100644 --- a/llama_stack/core/routers/eval_scoring.py +++ b/llama_stack/core/routers/eval_scoring.py @@ -16,7 +16,7 @@ from llama_stack.apis.scoring import ( from llama_stack.log import get_logger from llama_stack.providers.datatypes import RoutingTable -logger = 
get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routers") class ScoringRouter(Scoring): diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 6a3f07247..4b66601bb 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -65,7 +65,7 @@ from llama_stack.providers.datatypes import HealthResponse, HealthStatus, Routin from llama_stack.providers.utils.inference.inference_store import InferenceStore from llama_stack.providers.utils.telemetry.tracing import get_current_span -logger = get_logger(name=__name__, category="inference") +logger = get_logger(name=__name__, category="core::routers") class InferenceRouter(Inference): diff --git a/llama_stack/core/routers/safety.py b/llama_stack/core/routers/safety.py index 738ecded3..9ba3327f1 100644 --- a/llama_stack/core/routers/safety.py +++ b/llama_stack/core/routers/safety.py @@ -13,7 +13,7 @@ from llama_stack.apis.shields import Shield from llama_stack.log import get_logger from llama_stack.providers.datatypes import RoutingTable -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routers") class SafetyRouter(Safety): diff --git a/llama_stack/core/routers/tool_runtime.py b/llama_stack/core/routers/tool_runtime.py index 5a40bc0c5..fd606f33b 100644 --- a/llama_stack/core/routers/tool_runtime.py +++ b/llama_stack/core/routers/tool_runtime.py @@ -22,7 +22,7 @@ from llama_stack.log import get_logger from ..routing_tables.toolgroups import ToolGroupsRoutingTable -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routers") class ToolRuntimeRouter(ToolRuntime): diff --git a/llama_stack/core/routers/vector_io.py b/llama_stack/core/routers/vector_io.py index 3d0996c49..786b0e391 100644 --- a/llama_stack/core/routers/vector_io.py +++ b/llama_stack/core/routers/vector_io.py @@ -30,7 +30,7 @@ from llama_stack.apis.vector_io import ( from llama_stack.log import get_logger from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routers") class VectorIORouter(VectorIO): diff --git a/llama_stack/core/routing_tables/benchmarks.py b/llama_stack/core/routing_tables/benchmarks.py index 74bee8040..c875dee5b 100644 --- a/llama_stack/core/routing_tables/benchmarks.py +++ b/llama_stack/core/routing_tables/benchmarks.py @@ -14,7 +14,7 @@ from llama_stack.log import get_logger from .common import CommonRoutingTableImpl -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routing_tables") class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): diff --git a/llama_stack/core/routing_tables/common.py b/llama_stack/core/routing_tables/common.py index 339ff6da4..e523746d8 100644 --- a/llama_stack/core/routing_tables/common.py +++ b/llama_stack/core/routing_tables/common.py @@ -23,7 +23,7 @@ from llama_stack.core.store import DistributionRegistry from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api, RoutingTable -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routing_tables") def get_impl_api(p: Any) -> Api: diff --git a/llama_stack/core/routing_tables/datasets.py b/llama_stack/core/routing_tables/datasets.py index fc6a75df4..b129c9ec5 100644 --- 
a/llama_stack/core/routing_tables/datasets.py +++ b/llama_stack/core/routing_tables/datasets.py @@ -26,7 +26,7 @@ from llama_stack.log import get_logger from .common import CommonRoutingTableImpl -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routing_tables") class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): diff --git a/llama_stack/core/routing_tables/models.py b/llama_stack/core/routing_tables/models.py index 34c431e00..b6141efa9 100644 --- a/llama_stack/core/routing_tables/models.py +++ b/llama_stack/core/routing_tables/models.py @@ -17,7 +17,7 @@ from llama_stack.log import get_logger from .common import CommonRoutingTableImpl, lookup_model -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routing_tables") class ModelsRoutingTable(CommonRoutingTableImpl, Models): diff --git a/llama_stack/core/routing_tables/scoring_functions.py b/llama_stack/core/routing_tables/scoring_functions.py index 5874ba941..71e5bed63 100644 --- a/llama_stack/core/routing_tables/scoring_functions.py +++ b/llama_stack/core/routing_tables/scoring_functions.py @@ -19,7 +19,7 @@ from llama_stack.log import get_logger from .common import CommonRoutingTableImpl -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routing_tables") class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions): diff --git a/llama_stack/core/routing_tables/shields.py b/llama_stack/core/routing_tables/shields.py index e08f35bfc..b1918d20a 100644 --- a/llama_stack/core/routing_tables/shields.py +++ b/llama_stack/core/routing_tables/shields.py @@ -15,7 +15,7 @@ from llama_stack.log import get_logger from .common import CommonRoutingTableImpl -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routing_tables") class ShieldsRoutingTable(CommonRoutingTableImpl, Shields): diff --git a/llama_stack/core/routing_tables/toolgroups.py b/llama_stack/core/routing_tables/toolgroups.py index 6910b3906..eeea406c1 100644 --- a/llama_stack/core/routing_tables/toolgroups.py +++ b/llama_stack/core/routing_tables/toolgroups.py @@ -14,7 +14,7 @@ from llama_stack.log import get_logger from .common import CommonRoutingTableImpl -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routing_tables") def parse_toolgroup_from_toolgroup_name_pair(toolgroup_name_with_maybe_tool_name: str) -> str | None: diff --git a/llama_stack/core/routing_tables/vector_dbs.py b/llama_stack/core/routing_tables/vector_dbs.py index e8dc46997..00f71b4fe 100644 --- a/llama_stack/core/routing_tables/vector_dbs.py +++ b/llama_stack/core/routing_tables/vector_dbs.py @@ -30,7 +30,7 @@ from llama_stack.log import get_logger from .common import CommonRoutingTableImpl, lookup_model -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="core::routing_tables") class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs): diff --git a/llama_stack/core/server/auth.py b/llama_stack/core/server/auth.py index e4fb4ff2b..c98d3bec0 100644 --- a/llama_stack/core/server/auth.py +++ b/llama_stack/core/server/auth.py @@ -15,7 +15,7 @@ from llama_stack.core.server.auth_providers import create_auth_provider from llama_stack.core.server.routes import find_matching_route, initialize_route_impls from llama_stack.log import get_logger -logger = get_logger(name=__name__, 
category="auth") +logger = get_logger(name=__name__, category="core::auth") class AuthenticationMiddleware: diff --git a/llama_stack/core/server/auth_providers.py b/llama_stack/core/server/auth_providers.py index 73d5581c2..a8af6f75a 100644 --- a/llama_stack/core/server/auth_providers.py +++ b/llama_stack/core/server/auth_providers.py @@ -23,7 +23,7 @@ from llama_stack.core.datatypes import ( ) from llama_stack.log import get_logger -logger = get_logger(name=__name__, category="auth") +logger = get_logger(name=__name__, category="core::auth") class AuthResponse(BaseModel): diff --git a/llama_stack/core/server/quota.py b/llama_stack/core/server/quota.py index 1cb850cde..693f224c3 100644 --- a/llama_stack/core/server/quota.py +++ b/llama_stack/core/server/quota.py @@ -15,7 +15,7 @@ from llama_stack.providers.utils.kvstore.api import KVStore from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl -logger = get_logger(name=__name__, category="quota") +logger = get_logger(name=__name__, category="core::server") class QuotaMiddleware: diff --git a/llama_stack/core/server/server.py b/llama_stack/core/server/server.py index cbef8ef88..d6dfc3435 100644 --- a/llama_stack/core/server/server.py +++ b/llama_stack/core/server/server.py @@ -9,7 +9,7 @@ import asyncio import functools import inspect import json -import logging +import logging # allow-direct-logging import os import ssl import sys @@ -28,6 +28,7 @@ from aiohttp import hdrs from fastapi import Body, FastAPI, HTTPException, Request, Response from fastapi import Path as FastapiPath from fastapi.exceptions import RequestValidationError +from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse from openai import BadRequestError from pydantic import BaseModel, ValidationError @@ -40,6 +41,7 @@ from llama_stack.core.datatypes import ( AuthenticationRequiredError, LoggingConfig, StackRunConfig, + process_cors_config, ) from llama_stack.core.distribution import builtin_automatically_routed_apis from llama_stack.core.external import ExternalApiSpec, load_external_apis @@ -82,7 +84,7 @@ from .quota import QuotaMiddleware REPO_ROOT = Path(__file__).parent.parent.parent.parent -logger = get_logger(name=__name__, category="server") +logger = get_logger(name=__name__, category="core::server") def warn_with_traceback(message, category, filename, lineno, file=None, line=None): @@ -413,7 +415,7 @@ def main(args: argparse.Namespace | None = None): config_contents = yaml.safe_load(fp) if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")): logger_config = LoggingConfig(**cfg) - logger = get_logger(name=__name__, category="server", config=logger_config) + logger = get_logger(name=__name__, category="core::server", config=logger_config) if args.env: for env_pair in args.env: try: @@ -483,6 +485,12 @@ def main(args: argparse.Namespace | None = None): window_seconds=window_seconds, ) + if config.server.cors: + logger.info("Enabling CORS") + cors_config = process_cors_config(config.server.cors) + if cors_config: + app.add_middleware(CORSMiddleware, **cors_config.model_dump()) + if Api.telemetry in impls: setup_logger(impls[Api.telemetry]) else: diff --git a/llama_stack/core/store/registry.py b/llama_stack/core/store/registry.py index 4b60e1001..5f4abe9aa 100644 --- a/llama_stack/core/store/registry.py +++ b/llama_stack/core/store/registry.py @@ -16,7 +16,7 @@ from 
llama_stack.log import get_logger from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig -logger = get_logger(__name__, category="core") +logger = get_logger(__name__, category="core::registry") class DistributionRegistry(Protocol): diff --git a/llama_stack/core/utils/config_resolution.py b/llama_stack/core/utils/config_resolution.py index 30cd71e15..182a571ee 100644 --- a/llama_stack/core/utils/config_resolution.py +++ b/llama_stack/core/utils/config_resolution.py @@ -10,7 +10,7 @@ from pathlib import Path from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR from llama_stack.log import get_logger -logger = get_logger(name=__name__, category="config_resolution") +logger = get_logger(name=__name__, category="core") DISTRO_DIR = Path(__file__).parent.parent.parent.parent / "llama_stack" / "distributions" diff --git a/llama_stack/core/utils/exec.py b/llama_stack/core/utils/exec.py index 1b2b782fe..12fb82d01 100644 --- a/llama_stack/core/utils/exec.py +++ b/llama_stack/core/utils/exec.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging +import importlib import os import signal import subprocess @@ -12,9 +12,9 @@ import sys from termcolor import cprint -log = logging.getLogger(__name__) +from llama_stack.log import get_logger -import importlib +log = get_logger(name=__name__, category="core") def formulate_run_args(image_type: str, image_name: str) -> list: diff --git a/llama_stack/core/utils/prompt_for_config.py b/llama_stack/core/utils/prompt_for_config.py index 26f6920e0..bac0531ed 100644 --- a/llama_stack/core/utils/prompt_for_config.py +++ b/llama_stack/core/utils/prompt_for_config.py @@ -6,7 +6,6 @@ import inspect import json -import logging from enum import Enum from typing import Annotated, Any, Literal, Union, get_args, get_origin @@ -14,7 +13,9 @@ from pydantic import BaseModel from pydantic.fields import FieldInfo from pydantic_core import PydanticUndefinedType -log = logging.getLogger(__name__) +from llama_stack.log import get_logger + +log = get_logger(name=__name__, category="core") def is_list_of_primitives(field_type): diff --git a/llama_stack/distributions/ci-tests/build.yaml b/llama_stack/distributions/ci-tests/build.yaml index 0bf42e7ee..b4701cb81 100644 --- a/llama_stack/distributions/ci-tests/build.yaml +++ b/llama_stack/distributions/ci-tests/build.yaml @@ -34,7 +34,7 @@ distribution_spec: telemetry: - provider_type: inline::meta-reference post_training: - - provider_type: inline::huggingface + - provider_type: inline::huggingface-cpu eval: - provider_type: inline::meta-reference datasetio: diff --git a/llama_stack/distributions/ci-tests/run.yaml b/llama_stack/distributions/ci-tests/run.yaml index 02a268462..3acdd20f9 100644 --- a/llama_stack/distributions/ci-tests/run.yaml +++ b/llama_stack/distributions/ci-tests/run.yaml @@ -156,8 +156,8 @@ providers: sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} post_training: - - provider_id: huggingface - provider_type: inline::huggingface + - provider_id: huggingface-cpu + provider_type: inline::huggingface-cpu config: checkpoint_format: huggingface distributed_backend: null diff --git a/llama_stack/distributions/starter-gpu/__init__.py b/llama_stack/distributions/starter-gpu/__init__.py new file mode 100644 
index 000000000..e762f9b6e --- /dev/null +++ b/llama_stack/distributions/starter-gpu/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .starter_gpu import get_distribution_template # noqa: F401 diff --git a/llama_stack/distributions/starter-gpu/build.yaml b/llama_stack/distributions/starter-gpu/build.yaml new file mode 100644 index 000000000..ae0680cdc --- /dev/null +++ b/llama_stack/distributions/starter-gpu/build.yaml @@ -0,0 +1,59 @@ +version: 2 +distribution_spec: + description: Quick start template for running Llama Stack with several popular providers. + This distribution is intended for GPU-enabled environments. + providers: + inference: + - provider_type: remote::cerebras + - provider_type: remote::ollama + - provider_type: remote::vllm + - provider_type: remote::tgi + - provider_type: remote::fireworks + - provider_type: remote::together + - provider_type: remote::bedrock + - provider_type: remote::nvidia + - provider_type: remote::openai + - provider_type: remote::anthropic + - provider_type: remote::gemini + - provider_type: remote::vertexai + - provider_type: remote::groq + - provider_type: remote::sambanova + - provider_type: inline::sentence-transformers + vector_io: + - provider_type: inline::faiss + - provider_type: inline::sqlite-vec + - provider_type: inline::milvus + - provider_type: remote::chromadb + - provider_type: remote::pgvector + files: + - provider_type: inline::localfs + safety: + - provider_type: inline::llama-guard + - provider_type: inline::code-scanner + agents: + - provider_type: inline::meta-reference + telemetry: + - provider_type: inline::meta-reference + post_training: + - provider_type: inline::torchtune-gpu + eval: + - provider_type: inline::meta-reference + datasetio: + - provider_type: remote::huggingface + - provider_type: inline::localfs + scoring: + - provider_type: inline::basic + - provider_type: inline::llm-as-judge + - provider_type: inline::braintrust + tool_runtime: + - provider_type: remote::brave-search + - provider_type: remote::tavily-search + - provider_type: inline::rag-runtime + - provider_type: remote::model-context-protocol + batches: + - provider_type: inline::reference +image_type: venv +additional_pip_packages: +- aiosqlite +- asyncpg +- sqlalchemy[asyncio] diff --git a/llama_stack/distributions/starter-gpu/run.yaml b/llama_stack/distributions/starter-gpu/run.yaml new file mode 100644 index 000000000..81c802317 --- /dev/null +++ b/llama_stack/distributions/starter-gpu/run.yaml @@ -0,0 +1,238 @@ +version: 2 +image_name: starter-gpu +apis: +- agents +- batches +- datasetio +- eval +- files +- inference +- post_training +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: ${env.CEREBRAS_API_KEY:+cerebras} + provider_type: remote::cerebras + config: + base_url: https://api.cerebras.ai + api_key: ${env.CEREBRAS_API_KEY:=} + - provider_id: ${env.OLLAMA_URL:+ollama} + provider_type: remote::ollama + config: + url: ${env.OLLAMA_URL:=http://localhost:11434} + - provider_id: ${env.VLLM_URL:+vllm} + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:=} + max_tokens: ${env.VLLM_MAX_TOKENS:=4096} + api_token: ${env.VLLM_API_TOKEN:=fake} + tls_verify: ${env.VLLM_TLS_VERIFY:=true} + - provider_id: ${env.TGI_URL:+tgi} + provider_type: remote::tgi + config: + url: ${env.TGI_URL:=} + - provider_id: 
fireworks + provider_type: remote::fireworks + config: + url: https://api.fireworks.ai/inference/v1 + api_key: ${env.FIREWORKS_API_KEY:=} + - provider_id: together + provider_type: remote::together + config: + url: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY:=} + - provider_id: bedrock + provider_type: remote::bedrock + - provider_id: ${env.NVIDIA_API_KEY:+nvidia} + provider_type: remote::nvidia + config: + url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} + api_key: ${env.NVIDIA_API_KEY:=} + append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True} + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY:=} + base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1} + - provider_id: anthropic + provider_type: remote::anthropic + config: + api_key: ${env.ANTHROPIC_API_KEY:=} + - provider_id: gemini + provider_type: remote::gemini + config: + api_key: ${env.GEMINI_API_KEY:=} + - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai} + provider_type: remote::vertexai + config: + project: ${env.VERTEX_AI_PROJECT:=} + location: ${env.VERTEX_AI_LOCATION:=us-central1} + - provider_id: groq + provider_type: remote::groq + config: + url: https://api.groq.com + api_key: ${env.GROQ_API_KEY:=} + - provider_id: sambanova + provider_type: remote::sambanova + config: + url: https://api.sambanova.ai/v1 + api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db + - provider_id: sqlite-vec + provider_type: inline::sqlite-vec + config: + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db + - provider_id: ${env.MILVUS_URL:+milvus} + provider_type: inline::milvus + config: + db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db + - provider_id: ${env.CHROMADB_URL:+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:=} + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db + - provider_id: ${env.PGVECTOR_DB:+pgvector} + provider_type: remote::pgvector + config: + host: ${env.PGVECTOR_HOST:=localhost} + port: ${env.PGVECTOR_PORT:=5432} + db: ${env.PGVECTOR_DB:=} + user: ${env.PGVECTOR_USER:=} + password: ${env.PGVECTOR_PASSWORD:=} + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + - provider_id: code-scanner + provider_type: inline::code-scanner + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/agents_store.db + responses_store: + type: 
sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/responses_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" + sinks: ${env.TELEMETRY_SINKS:=console,sqlite} + sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/trace_store.db + otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} + post_training: + - provider_id: torchtune-gpu + provider_type: inline::torchtune-gpu + config: + checkpoint_format: meta + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:=} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:=} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:=} + max_results: 3 + - provider_id: rag-runtime + provider_type: inline::rag-runtime + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + batches: + - provider_id: reference + provider_type: inline::reference + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/batches.db +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/registry.db +inference_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/inference_store.db +models: [] +shields: +- shield_id: llama-guard + provider_id: ${env.SAFETY_MODEL:+llama-guard} + provider_shield_id: ${env.SAFETY_MODEL:=} +- shield_id: code-scanner + provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner} + provider_shield_id: ${env.CODE_SCANNER_MODEL:=} +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +server: + port: 8321 diff --git a/llama_stack/distributions/starter-gpu/starter_gpu.py b/llama_stack/distributions/starter-gpu/starter_gpu.py new file mode 100644 index 000000000..893df6c17 --- /dev/null +++ b/llama_stack/distributions/starter-gpu/starter_gpu.py @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
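A note on the `${env.…}` syntax that recurs throughout these run.yaml files: `${env.VAR:=default}` substitutes the variable's value with a fallback, while `${env.VAR:+value}` expands only when the variable is set, which is how optional providers such as `${env.OLLAMA_URL:+ollama}` get switched on and off. A minimal, hypothetical resolver illustrating just these two operators (the stack ships its own substitution logic; everything here is a stand-in for illustration):

```python
import os
import re

# Hypothetical stand-in for the stack's env substitution; only the two
# operators used in the run.yaml files above are handled.
_ENV_EXPR = re.compile(r"\$\{env\.(?P<name>[A-Za-z0-9_]+):(?P<op>[=+])(?P<arg>[^}]*)\}")

def resolve_env_expr(value: str) -> str:
    def _sub(match: re.Match) -> str:
        env_value = os.environ.get(match.group("name"))
        if match.group("op") == "=":
            # ${env.VAR:=default} -> the variable's value, else the default
            return env_value if env_value is not None else match.group("arg")
        # ${env.VAR:+value} -> `value` only when the variable is set (and non-empty)
        return match.group("arg") if env_value else ""
    return _ENV_EXPR.sub(_sub, value)

print(resolve_env_expr("${env.OLLAMA_URL:=http://localhost:11434}"))  # http://localhost:11434
print(resolve_env_expr("${env.OLLAMA_URL:+ollama}"))                  # "" when OLLAMA_URL is unset
```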
+ + +from llama_stack.distributions.template import BuildProvider, DistributionTemplate + +from ..starter.starter import get_distribution_template as get_starter_distribution_template + + +def get_distribution_template() -> DistributionTemplate: + template = get_starter_distribution_template() + name = "starter-gpu" + template.name = name + template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments." + + template.providers["post_training"] = [ + BuildProvider(provider_type="inline::torchtune-gpu"), + ] + return template diff --git a/llama_stack/distributions/starter/build.yaml b/llama_stack/distributions/starter/build.yaml index 2ad12a165..3df0eb129 100644 --- a/llama_stack/distributions/starter/build.yaml +++ b/llama_stack/distributions/starter/build.yaml @@ -1,6 +1,7 @@ version: 2 distribution_spec: - description: Quick start template for running Llama Stack with several popular providers + description: Quick start template for running Llama Stack with several popular providers. + This distribution is intended for CPU-only environments. providers: inference: - provider_type: remote::cerebras @@ -34,7 +35,7 @@ distribution_spec: telemetry: - provider_type: inline::meta-reference post_training: - - provider_type: inline::huggingface + - provider_type: inline::huggingface-cpu eval: - provider_type: inline::meta-reference datasetio: diff --git a/llama_stack/distributions/starter/run.yaml b/llama_stack/distributions/starter/run.yaml index 7ac4dc6b9..7e1d46a61 100644 --- a/llama_stack/distributions/starter/run.yaml +++ b/llama_stack/distributions/starter/run.yaml @@ -156,8 +156,8 @@ providers: sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/trace_store.db otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} post_training: - - provider_id: huggingface - provider_type: inline::huggingface + - provider_id: huggingface-cpu + provider_type: inline::huggingface-cpu config: checkpoint_format: huggingface distributed_backend: null diff --git a/llama_stack/distributions/starter/starter.py b/llama_stack/distributions/starter/starter.py index cad3d72d9..f49da0bb7 100644 --- a/llama_stack/distributions/starter/starter.py +++ b/llama_stack/distributions/starter/starter.py @@ -120,7 +120,7 @@ def get_distribution_template() -> DistributionTemplate: ], "agents": [BuildProvider(provider_type="inline::meta-reference")], "telemetry": [BuildProvider(provider_type="inline::meta-reference")], - "post_training": [BuildProvider(provider_type="inline::huggingface")], + "post_training": [BuildProvider(provider_type="inline::huggingface-cpu")], "eval": [BuildProvider(provider_type="inline::meta-reference")], "datasetio": [ BuildProvider(provider_type="remote::huggingface"), @@ -178,7 +178,7 @@ def get_distribution_template() -> DistributionTemplate: return DistributionTemplate( name=name, distro_type="self_hosted", - description="Quick start template for running Llama Stack with several popular providers", + description="Quick start template for running Llama Stack with several popular providers. 
This distribution is intended for CPU-only environments.", container_image=None, template_path=None, providers=providers, diff --git a/llama_stack/log.py b/llama_stack/log.py index d67bd1b61..cc4c9d4cf 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -4,10 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging +import logging # allow-direct-logging import os import re -from logging.config import dictConfig +from logging.config import dictConfig # allow-direct-logging from rich.console import Console from rich.errors import MarkupError diff --git a/llama_stack/models/llama/llama3/multimodal/encoder_utils.py b/llama_stack/models/llama/llama3/multimodal/encoder_utils.py index 5b5969d89..90ced13b2 100644 --- a/llama_stack/models/llama/llama3/multimodal/encoder_utils.py +++ b/llama_stack/models/llama/llama3/multimodal/encoder_utils.py @@ -13,14 +13,15 @@ # Copyright (c) Meta Platforms, Inc. and its affiliates. import math -from logging import getLogger import torch import torch.nn.functional as F +from llama_stack.log import get_logger + from .utils import get_negative_inf_value, to_2tuple -logger = getLogger() +logger = get_logger(name=__name__, category="models::llama") def resize_local_position_embedding(orig_pos_embed, grid_size): diff --git a/llama_stack/models/llama/llama3/multimodal/image_transform.py b/llama_stack/models/llama/llama3/multimodal/image_transform.py index f2761ee47..7b20a31fa 100644 --- a/llama_stack/models/llama/llama3/multimodal/image_transform.py +++ b/llama_stack/models/llama/llama3/multimodal/image_transform.py @@ -13,7 +13,6 @@ import math from collections import defaultdict -from logging import getLogger from typing import Any import torch @@ -21,9 +20,11 @@ import torchvision.transforms as tv from PIL import Image from torchvision.transforms import functional as F +from llama_stack.log import get_logger + IMAGE_RES = 224 -logger = getLogger() +logger = get_logger(name=__name__, category="models::llama") class VariableSizeImageTransform: diff --git a/llama_stack/models/llama/llama3/multimodal/model.py b/llama_stack/models/llama/llama3/multimodal/model.py index 5f1c3605c..7b501eb0e 100644 --- a/llama_stack/models/llama/llama3/multimodal/model.py +++ b/llama_stack/models/llama/llama3/multimodal/model.py @@ -3,8 +3,6 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
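Most of the hunks above and below repeat one pattern: module-level `logging.getLogger(...)` loggers are replaced by `llama_stack.log.get_logger(name=..., category=...)`, and `# allow-direct-logging` marks the few places (such as `log.py` itself) that may keep importing `logging` directly. A minimal sketch of what such a category-aware factory could look like, assuming the helper maps categories onto the logger-name hierarchy; the real implementation in `llama_stack/log.py` also handles rich formatting and per-category levels:

```python
import logging  # allow-direct-logging: this sketch is the logging shim itself

# Hedged sketch of a category-aware logger factory; mapping the category into
# the dotted logger name is an assumption made for illustration.
def get_logger(name: str, category: str = "uncategorized") -> logging.Logger:
    return logging.getLogger(f"llama_stack.{category}.{name}")

logger = get_logger(name=__name__, category="models::llama")
logger.info("loaded vision encoder weights")  # filed under models::llama
```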
- -import logging import math from collections.abc import Callable from functools import partial @@ -22,6 +20,8 @@ from PIL import Image as PIL_Image from torch import Tensor, nn from torch.distributed import _functional_collectives as funcol +from llama_stack.log import get_logger + from ..model import ModelArgs, RMSNorm, apply_rotary_emb, precompute_freqs_cis from .encoder_utils import ( build_encoder_attention_mask, @@ -34,9 +34,10 @@ from .encoder_utils import ( from .image_transform import VariableSizeImageTransform from .utils import get_negative_inf_value, to_2tuple -logger = logging.getLogger(__name__) MP_SCALE = 8 +logger = get_logger(name=__name__, category="models::llama") + def reduce_from_tensor_model_parallel_region(input_): """All-reduce the input tensor across model parallel group.""" @@ -771,7 +772,7 @@ class TilePositionEmbedding(nn.Module): if embed is not None: # reshape the weights to the correct shape nt_old, nt_old, _, w = embed.shape - logging.info(f"Resizing tile embedding from {nt_old}x{nt_old} to {self.num_tiles}x{self.num_tiles}") + logger.info(f"Resizing tile embedding from {nt_old}x{nt_old} to {self.num_tiles}x{self.num_tiles}") embed_new = TilePositionEmbedding._dynamic_resize(embed, self.num_tiles) # assign the weights to the module state_dict[prefix + "embedding"] = embed_new diff --git a/llama_stack/models/llama/llama3/tokenizer.py b/llama_stack/models/llama/llama3/tokenizer.py index e47b579e3..ad7ced1c5 100644 --- a/llama_stack/models/llama/llama3/tokenizer.py +++ b/llama_stack/models/llama/llama3/tokenizer.py @@ -4,8 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. + from collections.abc import Collection, Iterator, Sequence, Set -from logging import getLogger from pathlib import Path from typing import ( Literal, @@ -14,11 +14,9 @@ from typing import ( import tiktoken +from llama_stack.log import get_logger from llama_stack.models.llama.tokenizer_utils import load_bpe_file -logger = getLogger(__name__) - - # The tiktoken tokenizer can handle <=400k chars without # pyo3_runtime.PanicException. TIKTOKEN_MAX_ENCODE_CHARS = 400_000 @@ -31,6 +29,8 @@ MAX_NO_WHITESPACES_CHARS = 25_000 _INSTANCE = None +logger = get_logger(name=__name__, category="models::llama") + class Tokenizer: """ diff --git a/llama_stack/models/llama/llama3/tool_utils.py b/llama_stack/models/llama/llama3/tool_utils.py index 574080184..d0e3e7671 100644 --- a/llama_stack/models/llama/llama3/tool_utils.py +++ b/llama_stack/models/llama/llama3/tool_utils.py @@ -11,7 +11,7 @@ from llama_stack.log import get_logger from ..datatypes import BuiltinTool, RecursiveType, ToolCall, ToolPromptFormat -logger = get_logger(name=__name__, category="inference") +logger = get_logger(name=__name__, category="models::llama") BUILTIN_TOOL_PATTERN = r'\b(?P<tool_name>\w+)\.call\(query="(?P<query>[^"]*)"\)' CUSTOM_TOOL_CALL_PATTERN = re.compile(r"<function=(?P<function_name>[^}]+)>(?P<args>{.*?})") diff --git a/llama_stack/models/llama/llama4/quantization/loader.py b/llama_stack/models/llama/llama4/quantization/loader.py index 223744a5f..7557a8a64 100644 --- a/llama_stack/models/llama/llama4/quantization/loader.py +++ b/llama_stack/models/llama/llama4/quantization/loader.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree.
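For reference, the builtin tool-call pattern in `tool_utils.py` above parses model output of the `tool.call(query="...")` form via its named groups; a quick usage check (the example string is illustrative):

```python
import re

# Pattern as it appears in tool_utils.py above.
BUILTIN_TOOL_PATTERN = r'\b(?P<tool_name>\w+)\.call\(query="(?P<query>[^"]*)"\)'

match = re.search(BUILTIN_TOOL_PATTERN, 'brave_search.call(query="latest llama release")')
assert match is not None
print(match.group("tool_name"))  # brave_search
print(match.group("query"))      # latest llama release
```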
-import logging import os from collections.abc import Callable @@ -13,11 +12,13 @@ from fairscale.nn.model_parallel.initialize import get_model_parallel_rank from torch import Tensor, nn from torch.nn import functional as F +from llama_stack.log import get_logger + from ...datatypes import QuantizationMode from ..model import Transformer, TransformerBlock from ..moe import MoE -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="models::llama") def swiglu_wrapper_no_reduce( diff --git a/llama_stack/models/llama/llama4/tokenizer.py b/llama_stack/models/llama/llama4/tokenizer.py index e12b2cae0..bfbace8f9 100644 --- a/llama_stack/models/llama/llama4/tokenizer.py +++ b/llama_stack/models/llama/llama4/tokenizer.py @@ -5,7 +5,6 @@ # the root directory of this source tree. from collections.abc import Collection, Iterator, Sequence, Set -from logging import getLogger from pathlib import Path from typing import ( Literal, @@ -14,11 +13,9 @@ from typing import ( import tiktoken +from llama_stack.log import get_logger from llama_stack.models.llama.tokenizer_utils import load_bpe_file -logger = getLogger(__name__) - - # The tiktoken tokenizer can handle <=400k chars without # pyo3_runtime.PanicException. TIKTOKEN_MAX_ENCODE_CHARS = 400_000 @@ -101,6 +98,8 @@ BASIC_SPECIAL_TOKENS = [ "<|fim_suffix|>", ] +logger = get_logger(name=__name__, category="models::llama") + class Tokenizer: """ diff --git a/llama_stack/models/llama/quantize_impls.py b/llama_stack/models/llama/quantize_impls.py index a6400c5c9..0a205601f 100644 --- a/llama_stack/models/llama/quantize_impls.py +++ b/llama_stack/models/llama/quantize_impls.py @@ -6,9 +6,10 @@ # type: ignore import collections -import logging -log = logging.getLogger(__name__) +from llama_stack.log import get_logger + +log = get_logger(name=__name__, category="models::llama") try: import fbgemm_gpu.experimental.gen_ai # noqa: F401 diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 5f7c90879..fde38515b 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -84,7 +84,7 @@ MEMORY_QUERY_TOOL = "knowledge_search" WEB_SEARCH_TOOL = "web_search" RAG_TOOL_GROUP = "builtin::rag" -logger = get_logger(name=__name__, category="agents") +logger = get_logger(name=__name__, category="agents::meta_reference") class ChatAgent(ShieldRunnerMixin): diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index 30196c429..8bdde86b0 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
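The payoff of the `core::routers` / `models::llama` / `agents::meta_reference` renames in this diff is that log filtering can target a subsystem precisely. Assuming a `LLAMA_STACK_LOGGING`-style specification of semicolon-separated `category=level` pairs, in the spirit of what `llama_stack/log.py` consumes, the parsing could be sketched standalone like this (not the stack's actual parser):

```python
import os

# Sketch only: split "cat=level;cat=level" pairs into a mapping that a
# logging handler could consult per category.
def parse_category_levels(spec: str) -> dict[str, str]:
    levels: dict[str, str] = {}
    for entry in spec.split(";"):
        entry = entry.strip()
        if not entry:
            continue
        category, _, level = entry.partition("=")
        if category and level:
            levels[category.strip()] = level.strip().lower()
    return levels

spec = os.environ.get("LLAMA_STACK_LOGGING", "core=info;agents::meta_reference=debug")
print(parse_category_levels(spec))  # {'core': 'info', 'agents::meta_reference': 'debug'}
```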
-import logging import uuid from collections.abc import AsyncGenerator from datetime import UTC, datetime @@ -42,6 +41,7 @@ from llama_stack.apis.safety import Safety from llama_stack.apis.tools import ToolGroups, ToolRuntime from llama_stack.apis.vector_io import VectorIO from llama_stack.core.datatypes import AccessRule +from llama_stack.log import get_logger from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl from llama_stack.providers.utils.pagination import paginate_records from llama_stack.providers.utils.responses.responses_store import ResponsesStore @@ -51,7 +51,7 @@ from .config import MetaReferenceAgentsImplConfig from .persistence import AgentInfo from .responses.openai_responses import OpenAIResponsesImpl -logger = logging.getLogger() +logger = get_logger(name=__name__, category="agents::meta_reference") class MetaReferenceAgentsImpl(Agents): diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py index 0b234d96c..3b7b4729c 100644 --- a/llama_stack/providers/inline/agents/meta_reference/persistence.py +++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py @@ -5,7 +5,6 @@ # the root directory of this source tree. import json -import logging import uuid from datetime import UTC, datetime @@ -15,9 +14,10 @@ from llama_stack.core.access_control.access_control import AccessDeniedError, is from llama_stack.core.access_control.datatypes import AccessRule from llama_stack.core.datatypes import User from llama_stack.core.request_headers import get_authenticated_user +from llama_stack.log import get_logger from llama_stack.providers.utils.kvstore import KVStore -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="agents::meta_reference") class AgentSessionInfo(Session): diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py index e528a4005..c632e61aa 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py @@ -41,7 +41,7 @@ from .utils import ( convert_response_text_to_chat_response_format, ) -logger = get_logger(name=__name__, category="responses") +logger = get_logger(name=__name__, category="openai::responses") class OpenAIResponsePreviousResponseWithInputItems(BaseModel): diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py index 0879e978a..3e69fa5cd 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py @@ -47,7 +47,7 @@ from llama_stack.log import get_logger from .types import ChatCompletionContext, ChatCompletionResult from .utils import convert_chat_choice_to_response_message, is_function_tool_call -logger = get_logger(name=__name__, category="responses") +logger = get_logger(name=__name__, category="agents::meta_reference") class StreamingResponseOrchestrator: diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py b/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py index 5b98b4f51..b028c018b 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +++ 
b/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py @@ -38,7 +38,7 @@ from llama_stack.log import get_logger from .types import ChatCompletionContext, ToolExecutionResult -logger = get_logger(name=__name__, category="responses") +logger = get_logger(name=__name__, category="agents::meta_reference") class ToolExecutor: diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/utils.py b/llama_stack/providers/inline/agents/meta_reference/responses/utils.py index 1507a55c8..7aaeb4cd5 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/utils.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/utils.py @@ -17,6 +17,8 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseOutputMessageContent, OpenAIResponseOutputMessageContentOutputText, OpenAIResponseOutputMessageFunctionToolCall, + OpenAIResponseOutputMessageMCPCall, + OpenAIResponseOutputMessageMCPListTools, OpenAIResponseText, ) from llama_stack.apis.inference import ( @@ -99,14 +101,22 @@ async def convert_response_input_to_chat_messages( """ messages: list[OpenAIMessageParam] = [] if isinstance(input, list): + # extract all OpenAIResponseInputFunctionToolCallOutput items + # so their corresponding OpenAIToolMessageParam instances can + # be added immediately following the corresponding + # OpenAIAssistantMessageParam + tool_call_results = {} for input_item in input: if isinstance(input_item, OpenAIResponseInputFunctionToolCallOutput): - messages.append( - OpenAIToolMessageParam( - content=input_item.output, - tool_call_id=input_item.call_id, - ) + tool_call_results[input_item.call_id] = OpenAIToolMessageParam( + content=input_item.output, + tool_call_id=input_item.call_id, ) + + for input_item in input: + if isinstance(input_item, OpenAIResponseInputFunctionToolCallOutput): + # skip as these have been extracted and inserted in order + pass elif isinstance(input_item, OpenAIResponseOutputMessageFunctionToolCall): tool_call = OpenAIChatCompletionToolCall( index=0, @@ -117,6 +127,28 @@ async def convert_response_input_to_chat_messages( ), ) messages.append(OpenAIAssistantMessageParam(tool_calls=[tool_call])) + if input_item.call_id in tool_call_results: + messages.append(tool_call_results[input_item.call_id]) + del tool_call_results[input_item.call_id] + elif isinstance(input_item, OpenAIResponseOutputMessageMCPCall): + tool_call = OpenAIChatCompletionToolCall( + index=0, + id=input_item.id, + function=OpenAIChatCompletionToolCallFunction( + name=input_item.name, + arguments=input_item.arguments, + ), + ) + messages.append(OpenAIAssistantMessageParam(tool_calls=[tool_call])) + messages.append( + OpenAIToolMessageParam( + content=input_item.output, + tool_call_id=input_item.id, + ) + ) + elif isinstance(input_item, OpenAIResponseOutputMessageMCPListTools): + # the tool list will be handled separately + pass else: content = await convert_response_content_to_chat_content(input_item.content) message_type = await get_message_type_by_role(input_item.role) @@ -125,6 +157,10 @@ async def convert_response_input_to_chat_messages( f"Llama Stack OpenAI Responses does not yet support message role '{input_item.role}' in this context" ) messages.append(message_type(content=content)) + if len(tool_call_results): + raise ValueError( + f"Received function_call_output(s) with call_id(s) {tool_call_results.keys()}, but no corresponding function_call" + ) else: messages.append(OpenAIUserMessageParam(content=input)) return messages diff --git 
a/llama_stack/providers/inline/agents/meta_reference/safety.py b/llama_stack/providers/inline/agents/meta_reference/safety.py index 605f387b7..8f3ecf5c9 100644 --- a/llama_stack/providers/inline/agents/meta_reference/safety.py +++ b/llama_stack/providers/inline/agents/meta_reference/safety.py @@ -5,13 +5,13 @@ # the root directory of this source tree. import asyncio -import logging from llama_stack.apis.inference import Message from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel +from llama_stack.log import get_logger from llama_stack.providers.utils.telemetry import tracing -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="agents::meta_reference") class SafetyException(Exception): # noqa: N818 diff --git a/llama_stack/providers/inline/batches/reference/batches.py b/llama_stack/providers/inline/batches/reference/batches.py index 1ff554e70..26f0ad15a 100644 --- a/llama_stack/providers/inline/batches/reference/batches.py +++ b/llama_stack/providers/inline/batches/reference/batches.py @@ -5,6 +5,7 @@ # the root directory of this source tree. import asyncio +import hashlib import itertools import json import time @@ -136,28 +137,45 @@ class ReferenceBatchesImpl(Batches): endpoint: str, completion_window: Literal["24h"], metadata: dict[str, str] | None = None, + idempotency_key: str | None = None, ) -> BatchObject: """ Create a new batch for processing multiple API requests. - Error handling by levels - - 0. Input param handling, results in 40x errors before processing, e.g. - - Wrong completion_window - - Invalid metadata types - - Unknown endpoint - -> no batch created - 1. Errors preventing processing, result in BatchErrors aggregated in process_batch, e.g. - - input_file_id missing - - invalid json in file - - missing custom_id, method, url, body - - invalid model - - streaming - -> batch created, validation sends to failed status - 2. Processing errors, result in error_file_id entries, e.g. - - Any error returned from inference endpoint - -> batch created, goes to completed status + This implementation provides optional idempotency: when an idempotency key + (idempotency_key) is provided, a deterministic ID is generated based on the input + parameters. If a batch with the same parameters already exists, it will be + returned instead of creating a duplicate. Without an idempotency key, + each request creates a new batch with a unique ID. + + Args: + input_file_id: The ID of an uploaded file containing requests for the batch. + endpoint: The endpoint to be used for all requests in the batch. + completion_window: The time window within which the batch should be processed. + metadata: Optional metadata for the batch. + idempotency_key: Optional idempotency key for enabling idempotent behavior. + + Returns: + The created or existing batch object. """ + # Error handling by levels - + # 0. Input param handling, results in 40x errors before processing, e.g. + # - Wrong completion_window + # - Invalid metadata types + # - Unknown endpoint + # -> no batch created + # 1. Errors preventing processing, result in BatchErrors aggregated in process_batch, e.g. + # - input_file_id missing + # - invalid json in file + # - missing custom_id, method, url, body + # - invalid model + # - streaming + # -> batch created, validation sends to failed status + # 2. Processing errors, result in error_file_id entries, e.g. 
+ # - Any error returned from inference endpoint + # -> batch created, goes to completed status + # TODO: set expiration time for garbage collection if endpoint not in ["/v1/chat/completions"]: @@ -171,6 +189,35 @@ class ReferenceBatchesImpl(Batches): ) batch_id = f"batch_{uuid.uuid4().hex[:16]}" + + # For idempotent requests, use the idempotency key for the batch ID + # This ensures the same key always maps to the same batch ID, + # allowing us to detect parameter conflicts + if idempotency_key is not None: + hash_input = idempotency_key.encode("utf-8") + hash_digest = hashlib.sha256(hash_input).hexdigest()[:24] + batch_id = f"batch_{hash_digest}" + + try: + existing_batch = await self.retrieve_batch(batch_id) + + if ( + existing_batch.input_file_id != input_file_id + or existing_batch.endpoint != endpoint + or existing_batch.completion_window != completion_window + or existing_batch.metadata != metadata + ): + raise ConflictError( + f"Idempotency key '{idempotency_key}' was previously used with different parameters. " + "Either use a new idempotency key or ensure all parameters match the original request." + ) + + logger.info(f"Returning existing batch with ID: {batch_id}") + return existing_batch + except ResourceNotFoundError: + # Batch doesn't exist, continue with creation + pass + current_time = int(time.time()) batch = BatchObject( @@ -185,6 +232,7 @@ class ReferenceBatchesImpl(Batches): ) await self.kvstore.set(f"batch:{batch_id}", batch.to_json()) + logger.info(f"Created new batch with ID: {batch_id}") if self.process_batches: task = asyncio.create_task(self._process_batch(batch_id)) diff --git a/llama_stack/providers/inline/files/localfs/files.py b/llama_stack/providers/inline/files/localfs/files.py index 1e9dca3b5..4f6d571a4 100644 --- a/llama_stack/providers/inline/files/localfs/files.py +++ b/llama_stack/providers/inline/files/localfs/files.py @@ -11,6 +11,7 @@ from typing import Annotated from fastapi import File, Form, Response, UploadFile +from llama_stack.apis.common.errors import ResourceNotFoundError from llama_stack.apis.common.responses import Order from llama_stack.apis.files import ( Files, @@ -20,12 +21,15 @@ from llama_stack.apis.files import ( OpenAIFilePurpose, ) from llama_stack.core.datatypes import AccessRule +from llama_stack.log import get_logger from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl from .config import LocalfsFilesImplConfig +logger = get_logger(name=__name__, category="files") + class LocalfsFilesImpl(Files): def __init__(self, config: LocalfsFilesImplConfig, policy: list[AccessRule]) -> None: @@ -65,6 +69,18 @@ class LocalfsFilesImpl(Files): """Get the filesystem path for a file ID.""" return Path(self.config.storage_dir) / file_id + async def _lookup_file_id(self, file_id: str) -> tuple[OpenAIFileObject, Path]: + """Look up an OpenAIFileObject and filesystem path from its ID.""" + if not self.sql_store: + raise RuntimeError("Files provider not initialized") + + row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id}) + if not row: + raise ResourceNotFoundError(file_id, "File", "client.files.list()") + + file_path = Path(row.pop("file_path")) + return OpenAIFileObject(**row), file_path + # OpenAI Files API Implementation async def openai_upload_file( self, @@ -157,37 +173,19 @@ class LocalfsFilesImpl(Files): async def
openai_retrieve_file(self, file_id: str) -> OpenAIFileObject: """Returns information about a specific file.""" - if not self.sql_store: - raise RuntimeError("Files provider not initialized") + file_obj, _ = await self._lookup_file_id(file_id) - row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id}) - if not row: - raise ValueError(f"File with id {file_id} not found") - - return OpenAIFileObject( - id=row["id"], - filename=row["filename"], - purpose=OpenAIFilePurpose(row["purpose"]), - bytes=row["bytes"], - created_at=row["created_at"], - expires_at=row["expires_at"], - ) + return file_obj async def openai_delete_file(self, file_id: str) -> OpenAIFileDeleteResponse: """Delete a file.""" - if not self.sql_store: - raise RuntimeError("Files provider not initialized") - - row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id}) - if not row: - raise ValueError(f"File with id {file_id} not found") - # Delete physical file - file_path = Path(row["file_path"]) + _, file_path = await self._lookup_file_id(file_id) if file_path.exists(): file_path.unlink() # Delete metadata from database + assert self.sql_store is not None, "Files provider not initialized" await self.sql_store.delete("openai_files", where={"id": file_id}) return OpenAIFileDeleteResponse( @@ -197,25 +195,17 @@ class LocalfsFilesImpl(Files): async def openai_retrieve_file_content(self, file_id: str) -> Response: """Returns the contents of the specified file.""" - if not self.sql_store: - raise RuntimeError("Files provider not initialized") - - # Get file metadata - row = await self.sql_store.fetch_one("openai_files", policy=self.policy, where={"id": file_id}) - if not row: - raise ValueError(f"File with id {file_id} not found") - # Read file content - file_path = Path(row["file_path"]) - if not file_path.exists(): - raise ValueError(f"File content not found on disk: {file_path}") + file_obj, file_path = await self._lookup_file_id(file_id) - with open(file_path, "rb") as f: - content = f.read() + if not file_path.exists(): + logger.warning(f"File '{file_id}'s underlying '{file_path}' is missing, deleting metadata.") + await self.openai_delete_file(file_id) + raise ResourceNotFoundError(file_id, "File", "client.files.list()") # Return as binary response with appropriate content type return Response( - content=content, + content=file_path.read_bytes(), media_type="application/octet-stream", - headers={"Content-Disposition": f'attachment; filename="{row["filename"]}"'}, + headers={"Content-Disposition": f'attachment; filename="{file_obj.filename}"'}, ) diff --git a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py index 7ade75032..bb6a1bd03 100644 --- a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +++ b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py @@ -12,7 +12,6 @@ import copy import json -import logging import multiprocessing import os import tempfile @@ -32,13 +31,14 @@ from fairscale.nn.model_parallel.initialize import ( from pydantic import BaseModel, Field from torch.distributed.launcher.api import LaunchConfig, elastic_launch +from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import GenerationResult from llama_stack.providers.utils.inference.prompt_adapter import ( ChatCompletionRequestWithRawContent, CompletionRequestWithRawContent, ) -log = logging.getLogger(__name__) +log = 
get_logger(name=__name__, category="inference") class ProcessingMessageName(str, Enum): diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py index fea8a8189..34665b63e 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py @@ -4,13 +4,11 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging from collections.abc import AsyncGenerator from llama_stack.apis.inference import ( CompletionResponse, InferenceProvider, - InterleavedContent, LogProbConfig, Message, ResponseFormat, @@ -21,6 +19,7 @@ from llama_stack.apis.inference import ( ToolPromptFormat, ) from llama_stack.apis.models import ModelType +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate from llama_stack.providers.utils.inference.embedding_mixin import ( SentenceTransformerEmbeddingMixin, @@ -32,7 +31,7 @@ from llama_stack.providers.utils.inference.openai_compat import ( from .config import SentenceTransformersInferenceConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="inference") class SentenceTransformersInferenceImpl( @@ -100,25 +99,3 @@ class SentenceTransformersInferenceImpl( tool_config: ToolConfig | None = None, ) -> AsyncGenerator: raise ValueError("Sentence transformers don't support chat completion") - - async def batch_completion( - self, - model_id: str, - content_batch: list[InterleavedContent], - sampling_params: SamplingParams | None = None, - response_format: ResponseFormat | None = None, - logprobs: LogProbConfig | None = None, - ): - raise NotImplementedError("Batch completion is not supported for Sentence Transformers") - - async def batch_chat_completion( - self, - model_id: str, - messages_batch: list[list[Message]], - sampling_params: SamplingParams | None = None, - tools: list[ToolDefinition] | None = None, - tool_config: ToolConfig | None = None, - response_format: ResponseFormat | None = None, - logprobs: LogProbConfig | None = None, - ): - raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers") diff --git a/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py b/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py index 2574b995b..d9ee3d2a8 100644 --- a/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +++ b/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py @@ -6,7 +6,6 @@ import gc import json -import logging import multiprocessing from pathlib import Path from typing import Any @@ -28,6 +27,7 @@ from llama_stack.apis.post_training import ( LoraFinetuningConfig, TrainingConfig, ) +from llama_stack.log import get_logger from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device from ..config import HuggingFacePostTrainingConfig @@ -44,7 +44,7 @@ from ..utils import ( split_dataset, ) -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="post_training") class HFFinetuningSingleDevice: diff --git a/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py 
b/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py index a7c19faac..b39a24c66 100644 --- a/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +++ b/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py @@ -5,7 +5,6 @@ # the root directory of this source tree. import gc -import logging import multiprocessing from pathlib import Path from typing import Any @@ -24,6 +23,7 @@ from llama_stack.apis.post_training import ( DPOAlignmentConfig, TrainingConfig, ) +from llama_stack.log import get_logger from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device from ..config import HuggingFacePostTrainingConfig @@ -40,7 +40,7 @@ from ..utils import ( split_dataset, ) -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="post_training") class HFDPOAlignmentSingleDevice: diff --git a/llama_stack/providers/inline/post_training/huggingface/utils.py b/llama_stack/providers/inline/post_training/huggingface/utils.py index 3147c19ab..f229c87dd 100644 --- a/llama_stack/providers/inline/post_training/huggingface/utils.py +++ b/llama_stack/providers/inline/post_training/huggingface/utils.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging import os import signal import sys @@ -19,10 +18,11 @@ from transformers import AutoConfig, AutoModelForCausalLM from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.post_training import Checkpoint, TrainingConfig +from llama_stack.log import get_logger from .config import HuggingFacePostTrainingConfig -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="post_training") def setup_environment(): diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py index 49e1c95b8..8b1462862 100644 --- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
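Looking back at the `convert_response_input_to_chat_messages` change in `responses/utils.py` above: function-call outputs are buffered first, then re-emitted immediately after their matching assistant tool-call message, and any leftover output raises. A toy version of that pairing, using simplified `(kind, call_id, payload)` tuples as stand-ins for the OpenAI message types:

```python
# Toy input where the output arrives before its call, as can happen in a
# Responses input list.
items = [
    ("function_call_output", "call_1", "42"),
    ("function_call", "call_1", 'add(query="40+2")'),
]

# First pass: collect outputs keyed by call_id.
tool_call_results = {cid: out for kind, cid, out in items if kind == "function_call_output"}

# Second pass: emit each assistant tool call, then its result right after.
messages = []
for kind, cid, payload in items:
    if kind == "function_call":
        messages.append(("assistant", payload))
        if cid in tool_call_results:
            messages.append(("tool", tool_call_results.pop(cid)))

# Outputs with no matching call are an error, mirroring the ValueError above.
if tool_call_results:
    raise ValueError(f"function_call_output(s) without a matching function_call: {list(tool_call_results)}")

print(messages)  # [('assistant', 'add(query="40+2")'), ('tool', '42')]
```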
-import logging import os import time from datetime import UTC, datetime @@ -19,6 +18,7 @@ from torch.utils.data import DataLoader, DistributedSampler from torchtune import modules, training from torchtune import utils as torchtune_utils from torchtune.data import padded_collate_sft +from torchtune.models.llama3._tokenizer import Llama3Tokenizer from torchtune.modules.loss import CEWithChunkedOutputLoss from torchtune.modules.peft import ( get_adapter_params, @@ -45,6 +45,7 @@ from llama_stack.apis.post_training import ( ) from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR from llama_stack.core.utils.model_utils import model_local_dir +from llama_stack.log import get_logger from llama_stack.models.llama.sku_list import resolve_model from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device from llama_stack.providers.inline.post_training.torchtune.common import utils @@ -56,9 +57,7 @@ from llama_stack.providers.inline.post_training.torchtune.config import ( ) from llama_stack.providers.inline.post_training.torchtune.datasets.sft import SFTDataset -log = logging.getLogger(__name__) - -from torchtune.models.llama3._tokenizer import Llama3Tokenizer +log = get_logger(name=__name__, category="post_training") class LoraFinetuningSingleDevice: diff --git a/llama_stack/providers/inline/safety/code_scanner/code_scanner.py b/llama_stack/providers/inline/safety/code_scanner/code_scanner.py index 6e05d5b83..5e25c559f 100644 --- a/llama_stack/providers/inline/safety/code_scanner/code_scanner.py +++ b/llama_stack/providers/inline/safety/code_scanner/code_scanner.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging import uuid from typing import TYPE_CHECKING, Any @@ -20,13 +19,14 @@ from llama_stack.apis.safety import ( ) from llama_stack.apis.safety.safety import ModerationObject, ModerationObjectResults from llama_stack.apis.shields import Shield +from llama_stack.log import get_logger from llama_stack.providers.utils.inference.prompt_adapter import ( interleaved_content_as_str, ) from .config import CodeScannerConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="safety") ALLOWED_CODE_SCANNER_MODEL_IDS = [ "code-scanner", diff --git a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py index 5d52c5d89..5c7f30aa7 100644 --- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py +++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
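The idempotency scheme added to `batches.py` earlier in this diff is easy to check in isolation: the batch ID is a pure function of the caller's key, so retries land on the same ID and parameter conflicts can be detected by re-reading the store:

```python
import hashlib

# Same derivation as ReferenceBatchesImpl.create_batch above: sha256 of the
# idempotency key, truncated to 24 hex characters.
def batch_id_for(idempotency_key: str) -> str:
    digest = hashlib.sha256(idempotency_key.encode("utf-8")).hexdigest()[:24]
    return f"batch_{digest}"

assert batch_id_for("nightly-eval") == batch_id_for("nightly-eval")  # stable across retries
assert batch_id_for("nightly-eval") != batch_id_for("weekly-eval")
print(batch_id_for("nightly-eval"))
```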
-import logging import re import uuid from string import Template @@ -21,6 +20,7 @@ from llama_stack.apis.safety import ( from llama_stack.apis.safety.safety import ModerationObject, ModerationObjectResults from llama_stack.apis.shields import Shield from llama_stack.core.datatypes import Api +from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import Role from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.datatypes import ShieldsProtocolPrivate @@ -132,6 +132,8 @@ Provide your safety assessment for ONLY THE LAST $agent_type message in the abov PROMPT_TEMPLATE = Template(f"{PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS}") +logger = get_logger(name=__name__, category="safety") + class LlamaGuardSafetyImpl(Safety, ShieldsProtocolPrivate): def __init__(self, config: LlamaGuardConfig, deps) -> None: @@ -407,7 +409,7 @@ class LlamaGuardShield: unsafe_code_list = [code.strip() for code in unsafe_code.split(",")] invalid_codes = [code for code in unsafe_code_list if code not in SAFETY_CODE_TO_CATEGORIES_MAP] if invalid_codes: - logging.warning(f"Invalid safety codes returned: {invalid_codes}") + logger.warning(f"Invalid safety codes returned: {invalid_codes}") # just returning safe object, as we don't know what the invalid codes can map to return ModerationObject( id=f"modr-{uuid.uuid4()}", diff --git a/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py b/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py index c760f0fd1..6fb6c4407 100644 --- a/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +++ b/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
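On the `llama_guard.py` hunk above: unrecognized safety codes returned by the model are logged and treated as safe rather than trusted. The validation itself is a simple set-membership split; sketched here with a two-entry stand-in for the real category map:

```python
# Stand-in for SAFETY_CODE_TO_CATEGORIES_MAP; the real map covers the full
# Llama Guard taxonomy.
SAFETY_CODE_TO_CATEGORIES_MAP = {"S1": "Violent Crimes", "S2": "Non-Violent Crimes"}

def split_safety_codes(unsafe_code: str) -> tuple[list[str], list[str]]:
    codes = [code.strip() for code in unsafe_code.split(",")]
    valid = [c for c in codes if c in SAFETY_CODE_TO_CATEGORIES_MAP]
    invalid = [c for c in codes if c not in SAFETY_CODE_TO_CATEGORIES_MAP]
    return valid, invalid

print(split_safety_codes("S1, S99"))  # (['S1'], ['S99']) -> S99 would be logged as invalid
```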
-import logging from typing import Any import torch @@ -21,6 +20,7 @@ from llama_stack.apis.safety import ( from llama_stack.apis.safety.safety import ModerationObject from llama_stack.apis.shields import Shield from llama_stack.core.utils.model_utils import model_local_dir +from llama_stack.log import get_logger from llama_stack.providers.datatypes import ShieldsProtocolPrivate from llama_stack.providers.utils.inference.prompt_adapter import ( interleaved_content_as_str, @@ -28,7 +28,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import PromptGuardConfig, PromptGuardType -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="safety") PROMPT_GUARD_MODEL = "Prompt-Guard-86M" diff --git a/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py b/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py index b74c3826e..c9358101d 100644 --- a/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +++ b/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py @@ -7,7 +7,6 @@ import collections import functools import json -import logging import random import re import string @@ -20,7 +19,9 @@ import nltk from pythainlp.tokenize import sent_tokenize as sent_tokenize_thai from pythainlp.tokenize import word_tokenize as word_tokenize_thai -logger = logging.getLogger() +from llama_stack.log import get_logger + +logger = get_logger(name=__name__, category="scoring") WORD_LIST = [ "western", diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index d99255c79..9224c3792 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -4,13 +4,11 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging +import datetime import threading from typing import Any from opentelemetry import metrics, trace - -logger = logging.getLogger(__name__) from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.metrics import MeterProvider @@ -40,6 +38,7 @@ from llama_stack.apis.telemetry import ( UnstructuredLogEvent, ) from llama_stack.core.datatypes import Api +from llama_stack.log import get_logger from llama_stack.providers.inline.telemetry.meta_reference.console_span_processor import ( ConsoleSpanProcessor, ) @@ -61,6 +60,8 @@ _GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = { _global_lock = threading.Lock() _TRACER_PROVIDER = None +logger = get_logger(name=__name__, category="telemetry") + def is_tracing_enabled(tracer): with tracer.start_as_current_span("check_tracing") as span: @@ -145,11 +146,41 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): metric_name: str, start_time: int, end_time: int | None = None, - granularity: str | None = "1d", + granularity: str | None = None, query_type: MetricQueryType = MetricQueryType.RANGE, label_matchers: list[MetricLabelMatcher] | None = None, ) -> QueryMetricsResponse: - raise NotImplementedError("Querying metrics is not implemented") + """Query metrics from the telemetry store. 
+ + Args: + metric_name: The name of the metric to query (e.g., "prompt_tokens") + start_time: Start time as Unix timestamp + end_time: End time as Unix timestamp (defaults to now if None) + granularity: Time granularity for aggregation + query_type: Type of query (RANGE or INSTANT) + label_matchers: Label filters to apply + + Returns: + QueryMetricsResponse with metric time series data + """ + # Convert timestamps to datetime objects + start_dt = datetime.datetime.fromtimestamp(start_time, datetime.UTC) + end_dt = datetime.datetime.fromtimestamp(end_time, datetime.UTC) if end_time else None + + # Use SQLite trace store if available + if hasattr(self, "trace_store") and self.trace_store: + return await self.trace_store.query_metrics( + metric_name=metric_name, + start_time=start_dt, + end_time=end_dt, + granularity=granularity, + query_type=query_type, + label_matchers=label_matchers, + ) + else: + raise ValueError( + f"In order to query_metrics, you must have {TelemetrySink.SQLITE} set in your telemetry sinks" + ) def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None: with self._lock: diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py index 6a7c7885c..a1543457b 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -5,7 +5,6 @@ # the root directory of this source tree. import asyncio -import logging import secrets import string from typing import Any @@ -32,6 +31,7 @@ from llama_stack.apis.tools import ( ToolRuntime, ) from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO +from llama_stack.log import get_logger from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str from llama_stack.providers.utils.memory.vector_store import ( @@ -42,7 +42,7 @@ from llama_stack.providers.utils.memory.vector_store import ( from .config import RagToolRuntimeConfig from .context_retriever import generate_rag_query -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="tool_runtime") def make_random_string(length: int = 8): diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py index af61da59b..258c6e7aa 100644 --- a/llama_stack/providers/inline/vector_io/faiss/faiss.py +++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py @@ -8,7 +8,6 @@ import asyncio import base64 import io import json -import logging from typing import Any import faiss @@ -24,6 +23,7 @@ from llama_stack.apis.vector_io import ( QueryChunksResponse, VectorIO, ) +from llama_stack.log import get_logger from llama_stack.providers.datatypes import ( HealthResponse, HealthStatus, @@ -40,7 +40,7 @@ from llama_stack.providers.utils.memory.vector_store import ( from .config import FaissVectorIOConfig -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="vector_io") VERSION = "v3" VECTOR_DBS_PREFIX = f"vector_dbs:{VERSION}::" diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py index cc1982f3b..7cf163960 100644 --- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py @@ -5,7 +5,6 @@ # the root directory of this source tree. 
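The `query_metrics` implementation above converts the Unix timestamps it receives into timezone-aware datetimes before delegating to the trace store, and raises unless a SQLite telemetry sink is configured. A sketch of the expected call shape (hypothetical caller; the signature and the `prompt_tokens` metric name are from the diff above):

```python
import datetime


async def last_hour_of_prompt_tokens(adapter):
    # adapter is assumed to be a TelemetryAdapter with a SQLite sink configured.
    now = int(datetime.datetime.now(datetime.UTC).timestamp())
    return await adapter.query_metrics(
        metric_name="prompt_tokens",
        start_time=now - 3600,
        end_time=None,  # None means "now" inside the adapter
    )
```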
import asyncio -import logging import re import sqlite3 import struct @@ -24,6 +23,7 @@ from llama_stack.apis.vector_io import ( QueryChunksResponse, VectorIO, ) +from llama_stack.log import get_logger from llama_stack.providers.datatypes import VectorDBsProtocolPrivate from llama_stack.providers.utils.kvstore import kvstore_impl from llama_stack.providers.utils.kvstore.api import KVStore @@ -36,7 +36,7 @@ from llama_stack.providers.utils.memory.vector_store import ( VectorDBWithIndex, ) -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="vector_io") # Specifying search mode is dependent on the VectorIO provider. VECTOR_SEARCH = "vector" diff --git a/llama_stack/providers/registry/files.py b/llama_stack/providers/registry/files.py index e894debaf..ebe90310c 100644 --- a/llama_stack/providers/registry/files.py +++ b/llama_stack/providers/registry/files.py @@ -5,9 +5,11 @@ # the root directory of this source tree. from llama_stack.providers.datatypes import ( + AdapterSpec, Api, InlineProviderSpec, ProviderSpec, + remote_provider_spec, ) from llama_stack.providers.utils.sqlstore.sqlstore import sql_store_pip_packages @@ -23,4 +25,14 @@ def available_providers() -> list[ProviderSpec]: config_class="llama_stack.providers.inline.files.localfs.config.LocalfsFilesImplConfig", description="Local filesystem-based file storage provider for managing files and documents locally.", ), + remote_provider_spec( + api=Api.files, + adapter=AdapterSpec( + adapter_type="s3", + pip_packages=["boto3"] + sql_store_pip_packages, + module="llama_stack.providers.remote.files.s3", + config_class="llama_stack.providers.remote.files.s3.config.S3FilesImplConfig", + description="AWS S3-based file storage provider for scalable cloud file management with metadata persistence.", + ), + ), ] diff --git a/llama_stack/providers/registry/post_training.py b/llama_stack/providers/registry/post_training.py index ffd64ef7c..4443f4df1 100644 --- a/llama_stack/providers/registry/post_training.py +++ b/llama_stack/providers/registry/post_training.py @@ -5,34 +5,74 @@ # the root directory of this source tree. +from typing import cast + from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec +# We provide two versions of these providers so that distributions can package the appropriate version of torch. +# The CPU version is used for distributions that don't have GPU support -- they result in smaller container images. 
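Concretely, the registry below keeps one base definition per framework and overrides only `provider_type` and `pip_packages` for each variant. The pattern in reduced, illustrative form (module and package names here are invented for the sketch):

```python
# Shared base definition for one framework.
base = dict(
    module="example.post_training",
    pip_packages=["trl", "peft"],
)

# The CPU build pins torch to the CPU wheel index for smaller images;
# the GPU build takes the default torch wheel.
cpu_variant = {
    **base,
    "provider_type": "inline::example-cpu",
    "pip_packages": base["pip_packages"] + ["torch --index-url https://download.pytorch.org/whl/cpu"],
}
gpu_variant = {
    **base,
    "provider_type": "inline::example-gpu",
    "pip_packages": base["pip_packages"] + ["torch"],
}
```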
+torchtune_def = dict( + api=Api.post_training, + pip_packages=["torchtune==0.5.0", "torchao==0.8.0", "numpy"], + module="llama_stack.providers.inline.post_training.torchtune", + config_class="llama_stack.providers.inline.post_training.torchtune.TorchtunePostTrainingConfig", + api_dependencies=[ + Api.datasetio, + Api.datasets, + ], + description="TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.", +) + +huggingface_def = dict( + api=Api.post_training, + pip_packages=["trl", "transformers", "peft", "datasets"], + module="llama_stack.providers.inline.post_training.huggingface", + config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig", + api_dependencies=[ + Api.datasetio, + Api.datasets, + ], + description="HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.", +) + def available_providers() -> list[ProviderSpec]: return [ InlineProviderSpec( - api=Api.post_training, - provider_type="inline::torchtune", - pip_packages=["torch", "torchtune==0.5.0", "torchao==0.8.0", "numpy"], - module="llama_stack.providers.inline.post_training.torchtune", - config_class="llama_stack.providers.inline.post_training.torchtune.TorchtunePostTrainingConfig", - api_dependencies=[ - Api.datasetio, - Api.datasets, - ], - description="TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.", + **{ + **torchtune_def, + "provider_type": "inline::torchtune-cpu", + "pip_packages": ( + cast(list[str], torchtune_def["pip_packages"]) + + ["torch torchtune==0.5.0 torchao==0.8.0 --index-url https://download.pytorch.org/whl/cpu"] + ), + }, ), InlineProviderSpec( - api=Api.post_training, - provider_type="inline::huggingface", - pip_packages=["torch", "trl", "transformers", "peft", "datasets"], - module="llama_stack.providers.inline.post_training.huggingface", - config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig", - api_dependencies=[ - Api.datasetio, - Api.datasets, - ], - description="HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.", + **{ + **huggingface_def, + "provider_type": "inline::huggingface-cpu", + "pip_packages": ( + cast(list[str], huggingface_def["pip_packages"]) + + ["torch --index-url https://download.pytorch.org/whl/cpu"] + ), + }, + ), + InlineProviderSpec( + **{ + **torchtune_def, + "provider_type": "inline::torchtune-gpu", + "pip_packages": ( + cast(list[str], torchtune_def["pip_packages"]) + ["torch torchtune==0.5.0 torchao==0.8.0"] + ), + }, + ), + InlineProviderSpec( + **{ + **huggingface_def, + "provider_type": "inline::huggingface-gpu", + "pip_packages": (cast(list[str], huggingface_def["pip_packages"]) + ["torch"]), + }, ), remote_provider_spec( api=Api.post_training, diff --git a/llama_stack/providers/remote/files/s3/README.md b/llama_stack/providers/remote/files/s3/README.md new file mode 100644 index 000000000..0f33122c7 --- /dev/null +++ b/llama_stack/providers/remote/files/s3/README.md @@ -0,0 +1,237 @@ +# S3 Files Provider + +A remote S3-based implementation of the Llama Stack Files API that provides scalable cloud file storage with metadata persistence. 
+ +## Features + +- **AWS S3 Storage**: Store files in AWS S3 buckets for scalable, durable storage +- **Metadata Management**: Uses SQL database for efficient file metadata queries +- **OpenAI API Compatibility**: Full compatibility with OpenAI Files API endpoints +- **Flexible Authentication**: Support for IAM roles and access keys +- **Custom S3 Endpoints**: Support for MinIO and other S3-compatible services + +## Configuration + +### Basic Configuration + +```yaml +api: files +provider_type: remote::s3 +config: + bucket_name: my-llama-stack-files + region: us-east-1 + metadata_store: + type: sqlite + db_path: ./s3_files_metadata.db +``` + +### Advanced Configuration + +```yaml +api: files +provider_type: remote::s3 +config: + bucket_name: my-llama-stack-files + region: us-east-1 + aws_access_key_id: YOUR_ACCESS_KEY + aws_secret_access_key: YOUR_SECRET_KEY + endpoint_url: https://s3.amazonaws.com # Optional for custom endpoints + metadata_store: + type: sqlite + db_path: ./s3_files_metadata.db +``` + +### Environment Variables + +The configuration supports environment variable substitution: + +```yaml +config: + bucket_name: "${env.S3_BUCKET_NAME}" + region: "${env.AWS_REGION:=us-east-1}" + aws_access_key_id: "${env.AWS_ACCESS_KEY_ID:=}" + aws_secret_access_key: "${env.AWS_SECRET_ACCESS_KEY:=}" + endpoint_url: "${env.S3_ENDPOINT_URL:=}" +``` + +Note: `S3_BUCKET_NAME` has no default value since S3 bucket names must be globally unique. + +## Authentication + +### IAM Roles (Recommended) + +For production deployments, use IAM roles: + +```yaml +config: + bucket_name: my-bucket + region: us-east-1 + # No credentials needed - will use IAM role +``` + +### Access Keys + +For development or specific use cases: + +```yaml +config: + bucket_name: my-bucket + region: us-east-1 + aws_access_key_id: AKIAIOSFODNN7EXAMPLE + aws_secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +``` + +## S3 Bucket Setup + +### Required Permissions + +The S3 provider requires the following permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::your-bucket-name", + "arn:aws:s3:::your-bucket-name/*" + ] + } + ] +} +``` + +### Automatic Bucket Creation + +By default, the S3 provider expects the bucket to already exist. 
If you want the provider to automatically create the bucket when it doesn't exist, set `auto_create_bucket: true` in your configuration:
+
+```yaml
+config:
+  bucket_name: my-bucket
+  auto_create_bucket: true  # Will create bucket if it doesn't exist
+  region: us-east-1
+```
+
+**Note**: When `auto_create_bucket` is enabled, the provider will need additional permissions:
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "s3:GetObject",
+        "s3:PutObject",
+        "s3:DeleteObject",
+        "s3:ListBucket",
+        "s3:CreateBucket"
+      ],
+      "Resource": [
+        "arn:aws:s3:::your-bucket-name",
+        "arn:aws:s3:::your-bucket-name/*"
+      ]
+    }
+  ]
+}
+```
+
+### Bucket Policy (Optional)
+
+For additional security, you can add a bucket policy:
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Sid": "LlamaStackAccess",
+      "Effect": "Allow",
+      "Principal": {
+        "AWS": "arn:aws:iam::YOUR-ACCOUNT:role/LlamaStackRole"
+      },
+      "Action": [
+        "s3:GetObject",
+        "s3:PutObject",
+        "s3:DeleteObject"
+      ],
+      "Resource": "arn:aws:s3:::your-bucket-name/*"
+    },
+    {
+      "Sid": "LlamaStackBucketAccess",
+      "Effect": "Allow",
+      "Principal": {
+        "AWS": "arn:aws:iam::YOUR-ACCOUNT:role/LlamaStackRole"
+      },
+      "Action": [
+        "s3:ListBucket"
+      ],
+      "Resource": "arn:aws:s3:::your-bucket-name"
+    }
+  ]
+}
+```
+
+## Implementation Details
+
+### Metadata Persistence
+
+File metadata is stored in a SQL database for fast queries and OpenAI API compatibility. The metadata includes:
+
+- File ID
+- Original filename
+- Purpose (assistants, batch, etc.)
+- File size in bytes
+- Created and expiration timestamps
+
+### TTL and Cleanup
+
+Files currently have a fixed long expiration time (100 years).
+
+## Development and Testing
+
+### Using MinIO
+
+For self-hosted S3-compatible storage:
+
+```yaml
+config:
+  bucket_name: test-bucket
+  region: us-east-1
+  endpoint_url: http://localhost:9000
+  aws_access_key_id: minioadmin
+  aws_secret_access_key: minioadmin
+```
+
+## Monitoring and Logging
+
+The provider logs important operations and errors. For production deployments, consider:
+
+- CloudWatch monitoring for S3 operations
+- Custom metrics for file upload/download rates
+- Error rate monitoring
+- Performance metrics tracking
+
+## Error Handling
+
+The provider handles various error scenarios:
+
+- S3 connectivity issues
+- Bucket access permissions
+- File not found errors
+- Metadata consistency checks
+
+## Known Limitations
+
+- Fixed long TTL (100 years) instead of configurable expiration
+- No server-side encryption enabled by default
+- No support for AWS session tokens
+- No S3 key prefix organization support
+- No multipart upload support (all files uploaded as single objects)
diff --git a/llama_stack/providers/remote/files/s3/__init__.py b/llama_stack/providers/remote/files/s3/__init__.py
new file mode 100644
index 000000000..3f5dfc88a
--- /dev/null
+++ b/llama_stack/providers/remote/files/s3/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
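The metadata fields listed under "Metadata Persistence" map directly onto the `openai_files` table columns the provider creates (see `files.py` below). An illustrative row, with invented values:

```python
row = {
    "id": "file-0123456789abcdef0123456789abcdef",  # f"file-{uuid.uuid4().hex}"
    "filename": "report.pdf",
    "purpose": "assistants",
    "bytes": 1048576,
    "created_at": 1_700_000_000,  # Unix seconds
    "expires_at": 1_700_000_000 + 100 * 365 * 24 * 60 * 60,  # fixed ~100-year offset
}
```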
+ +from typing import Any + +from llama_stack.core.datatypes import Api + +from .config import S3FilesImplConfig + + +async def get_adapter_impl(config: S3FilesImplConfig, deps: dict[Api, Any]): + from .files import S3FilesImpl + + # TODO: authorization policies and user separation + impl = S3FilesImpl(config) + await impl.initialize() + return impl diff --git a/llama_stack/providers/remote/files/s3/config.py b/llama_stack/providers/remote/files/s3/config.py new file mode 100644 index 000000000..da20d8668 --- /dev/null +++ b/llama_stack/providers/remote/files/s3/config.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any + +from pydantic import BaseModel, Field + +from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig + + +class S3FilesImplConfig(BaseModel): + """Configuration for S3-based files provider.""" + + bucket_name: str = Field(description="S3 bucket name to store files") + region: str = Field(default="us-east-1", description="AWS region where the bucket is located") + aws_access_key_id: str | None = Field(default=None, description="AWS access key ID (optional if using IAM roles)") + aws_secret_access_key: str | None = Field( + default=None, description="AWS secret access key (optional if using IAM roles)" + ) + endpoint_url: str | None = Field(default=None, description="Custom S3 endpoint URL (for MinIO, LocalStack, etc.)") + auto_create_bucket: bool = Field( + default=False, description="Automatically create the S3 bucket if it doesn't exist" + ) + metadata_store: SqlStoreConfig = Field(description="SQL store configuration for file metadata") + + @classmethod + def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]: + return { + "bucket_name": "${env.S3_BUCKET_NAME}", # no default, buckets must be globally unique + "region": "${env.AWS_REGION:=us-east-1}", + "aws_access_key_id": "${env.AWS_ACCESS_KEY_ID:=}", + "aws_secret_access_key": "${env.AWS_SECRET_ACCESS_KEY:=}", + "endpoint_url": "${env.S3_ENDPOINT_URL:=}", + "auto_create_bucket": "${env.S3_AUTO_CREATE_BUCKET:=false}", + "metadata_store": SqliteSqlStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="s3_files_metadata.db", + ), + } diff --git a/llama_stack/providers/remote/files/s3/files.py b/llama_stack/providers/remote/files/s3/files.py new file mode 100644 index 000000000..52e0cbbf4 --- /dev/null +++ b/llama_stack/providers/remote/files/s3/files.py @@ -0,0 +1,272 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
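`S3FilesImplConfig` is a plain pydantic model, so it can also be constructed programmatically, e.g. in tests against MinIO. A sketch mirroring the README's MinIO example (this assumes `SqliteSqlStoreConfig` accepts a `db_path`, as its sample config above suggests):

```python
from llama_stack.providers.remote.files.s3.config import S3FilesImplConfig
from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig

config = S3FilesImplConfig(
    bucket_name="test-bucket",
    region="us-east-1",
    endpoint_url="http://localhost:9000",  # local MinIO
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
    metadata_store=SqliteSqlStoreConfig(db_path="./s3_files_metadata.db"),
)
```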
+ +import time +import uuid +from typing import Annotated + +import boto3 +from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError +from fastapi import File, Form, Response, UploadFile + +from llama_stack.apis.common.errors import ResourceNotFoundError +from llama_stack.apis.common.responses import Order +from llama_stack.apis.files import ( + Files, + ListOpenAIFileResponse, + OpenAIFileDeleteResponse, + OpenAIFileObject, + OpenAIFilePurpose, +) +from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType +from llama_stack.providers.utils.sqlstore.sqlstore import SqlStore, sqlstore_impl + +from .config import S3FilesImplConfig + +# TODO: provider data for S3 credentials + + +def _create_s3_client(config: S3FilesImplConfig) -> boto3.client: + try: + s3_config = { + "region_name": config.region, + } + + # endpoint URL if specified (for MinIO, LocalStack, etc.) + if config.endpoint_url: + s3_config["endpoint_url"] = config.endpoint_url + + if config.aws_access_key_id and config.aws_secret_access_key: + s3_config.update( + { + "aws_access_key_id": config.aws_access_key_id, + "aws_secret_access_key": config.aws_secret_access_key, + } + ) + + return boto3.client("s3", **s3_config) + + except (BotoCoreError, NoCredentialsError) as e: + raise RuntimeError(f"Failed to initialize S3 client: {e}") from e + + +async def _create_bucket_if_not_exists(client: boto3.client, config: S3FilesImplConfig) -> None: + try: + client.head_bucket(Bucket=config.bucket_name) + except ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code == "404": + if not config.auto_create_bucket: + raise RuntimeError( + f"S3 bucket '{config.bucket_name}' does not exist. " + f"Either create the bucket manually or set 'auto_create_bucket: true' in your configuration." 
+                ) from e
+            try:
+                # For us-east-1, we can't specify LocationConstraint
+                if config.region == "us-east-1":
+                    client.create_bucket(Bucket=config.bucket_name)
+                else:
+                    client.create_bucket(
+                        Bucket=config.bucket_name,
+                        CreateBucketConfiguration={"LocationConstraint": config.region},
+                    )
+            except ClientError as create_error:
+                raise RuntimeError(
+                    f"Failed to create S3 bucket '{config.bucket_name}': {create_error}"
+                ) from create_error
+        elif error_code == "403":
+            raise RuntimeError(f"Access denied to S3 bucket '{config.bucket_name}'") from e
+        else:
+            raise RuntimeError(f"Failed to access S3 bucket '{config.bucket_name}': {e}") from e
+
+
+class S3FilesImpl(Files):
+    """S3-based implementation of the Files API."""
+
+    # TODO: implement expiration, for now a silly offset
+    _SILLY_EXPIRATION_OFFSET = 100 * 365 * 24 * 60 * 60
+
+    def __init__(self, config: S3FilesImplConfig) -> None:
+        self._config = config
+        self._client: boto3.client | None = None
+        self._sql_store: SqlStore | None = None
+
+    async def initialize(self) -> None:
+        self._client = _create_s3_client(self._config)
+        await _create_bucket_if_not_exists(self._client, self._config)
+
+        self._sql_store = sqlstore_impl(self._config.metadata_store)
+        await self._sql_store.create_table(
+            "openai_files",
+            {
+                "id": ColumnDefinition(type=ColumnType.STRING, primary_key=True),
+                "filename": ColumnType.STRING,
+                "purpose": ColumnType.STRING,
+                "bytes": ColumnType.INTEGER,
+                "created_at": ColumnType.INTEGER,
+                "expires_at": ColumnType.INTEGER,
+                # TODO: add s3_etag field for integrity checking
+            },
+        )
+
+    async def shutdown(self) -> None:
+        pass
+
+    @property
+    def client(self) -> boto3.client:
+        assert self._client is not None, "Provider not initialized"
+        return self._client
+
+    @property
+    def sql_store(self) -> SqlStore:
+        assert self._sql_store is not None, "Provider not initialized"
+        return self._sql_store
+
+    async def openai_upload_file(
+        self,
+        file: Annotated[UploadFile, File()],
+        purpose: Annotated[OpenAIFilePurpose, Form()],
+    ) -> OpenAIFileObject:
+        file_id = f"file-{uuid.uuid4().hex}"
+
+        filename = getattr(file, "filename", None) or "uploaded_file"
+
+        created_at = int(time.time())
+        expires_at = created_at + self._SILLY_EXPIRATION_OFFSET
+        content = await file.read()
+        file_size = len(content)
+
+        await self.sql_store.insert(
+            "openai_files",
+            {
+                "id": file_id,
+                "filename": filename,
+                "purpose": purpose.value,
+                "bytes": file_size,
+                "created_at": created_at,
+                "expires_at": expires_at,
+            },
+        )
+
+        try:
+            self.client.put_object(
+                Bucket=self._config.bucket_name,
+                Key=file_id,
+                Body=content,
+                # TODO: enable server-side encryption
+            )
+        except ClientError as e:
+            await self.sql_store.delete("openai_files", where={"id": file_id})
+
+            raise RuntimeError(f"Failed to upload file to S3: {e}") from e
+
+        return OpenAIFileObject(
+            id=file_id,
+            filename=filename,
+            purpose=purpose,
+            bytes=file_size,
+            created_at=created_at,
+            expires_at=expires_at,
+        )
+
+    async def openai_list_files(
+        self,
+        after: str | None = None,
+        limit: int | None = 10000,
+        order: Order | None = Order.desc,
+        purpose: OpenAIFilePurpose | None = None,
+    ) -> ListOpenAIFileResponse:
+        # this is purely defensive; it should not happen because the router also defaults to Order.desc.
+ if not order: + order = Order.desc + + where_conditions = {} + if purpose: + where_conditions["purpose"] = purpose.value + + paginated_result = await self.sql_store.fetch_all( + table="openai_files", + where=where_conditions if where_conditions else None, + order_by=[("created_at", order.value)], + cursor=("id", after) if after else None, + limit=limit, + ) + + files = [ + OpenAIFileObject( + id=row["id"], + filename=row["filename"], + purpose=OpenAIFilePurpose(row["purpose"]), + bytes=row["bytes"], + created_at=row["created_at"], + expires_at=row["expires_at"], + ) + for row in paginated_result.data + ] + + return ListOpenAIFileResponse( + data=files, + has_more=paginated_result.has_more, + # empty string or None? spec says str, ref impl returns str | None, we go with spec + first_id=files[0].id if files else "", + last_id=files[-1].id if files else "", + ) + + async def openai_retrieve_file(self, file_id: str) -> OpenAIFileObject: + row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + if not row: + raise ResourceNotFoundError(file_id, "File", "files.list()") + + return OpenAIFileObject( + id=row["id"], + filename=row["filename"], + purpose=OpenAIFilePurpose(row["purpose"]), + bytes=row["bytes"], + created_at=row["created_at"], + expires_at=row["expires_at"], + ) + + async def openai_delete_file(self, file_id: str) -> OpenAIFileDeleteResponse: + row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + if not row: + raise ResourceNotFoundError(file_id, "File", "files.list()") + + try: + self.client.delete_object( + Bucket=self._config.bucket_name, + Key=row["id"], + ) + except ClientError as e: + if e.response["Error"]["Code"] != "NoSuchKey": + raise RuntimeError(f"Failed to delete file from S3: {e}") from e + + await self.sql_store.delete("openai_files", where={"id": file_id}) + + return OpenAIFileDeleteResponse(id=file_id, deleted=True) + + async def openai_retrieve_file_content(self, file_id: str) -> Response: + row = await self.sql_store.fetch_one("openai_files", where={"id": file_id}) + if not row: + raise ResourceNotFoundError(file_id, "File", "files.list()") + + try: + response = self.client.get_object( + Bucket=self._config.bucket_name, + Key=row["id"], + ) + # TODO: can we stream this instead of loading it into memory + content = response["Body"].read() + except ClientError as e: + if e.response["Error"]["Code"] == "NoSuchKey": + await self.sql_store.delete("openai_files", where={"id": file_id}) + raise ResourceNotFoundError(file_id, "File", "files.list()") from e + raise RuntimeError(f"Failed to download file from S3: {e}") from e + + return Response( + content=content, + media_type="application/octet-stream", + headers={"Content-Disposition": f'attachment; filename="{row["filename"]}"'}, + ) diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index bd86f7238..e907e8ec6 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -65,7 +65,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import FireworksImplConfig from .models import MODEL_ENTRIES -logger = get_logger(name=__name__, category="inference") +logger = get_logger(name=__name__, category="inference::fireworks") class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProviderData): diff --git 
a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py index 4857c6723..f2069b5e5 100644
--- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@@ -3,15 +3,14 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import logging
-
+from llama_stack.log import get_logger
 from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .models import MODEL_ENTRIES

-logger = logging.getLogger(__name__)
+logger = get_logger(name=__name__, category="inference::llama_openai_compat")


 class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index 4a072215c..d96b29fef 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -41,6 +41,11 @@ client.initialize()

 ### Create Completion

+> Note on the Completion API
+>
+> The hosted NVIDIA Llama NIMs (e.g., `meta-llama/Llama-3.1-8B-Instruct`) with `NVIDIA_BASE_URL="https://integrate.api.nvidia.com"` do not support the `completion` method, while the locally deployed NIM does.
+
+
 ```python
 response = client.inference.completion(
     model_id="meta-llama/Llama-3.1-8B-Instruct",
@@ -76,7 +81,78 @@ response = client.inference.chat_completion(
 print(f"Response: {response.completion_message.content}")
 ```

+### Tool Calling Example
+```python
+from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+
+tool_definition = ToolDefinition(
+    tool_name="get_weather",
+    description="Get current weather information for a location",
+    parameters={
+        "location": ToolParamDefinition(
+            param_type="string",
+            description="The city and state, e.g.
San Francisco, CA", + required=True, + ), + "unit": ToolParamDefinition( + param_type="string", + description="Temperature unit (celsius or fahrenheit)", + required=False, + default="celsius", + ), + }, +) + +tool_response = client.inference.chat_completion( + model_id="meta-llama/Llama-3.1-8B-Instruct", + messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], + tools=[tool_definition], +) + +print(f"Tool Response: {tool_response.completion_message.content}") +if tool_response.completion_message.tool_calls: + for tool_call in tool_response.completion_message.tool_calls: + print(f"Tool Called: {tool_call.tool_name}") + print(f"Arguments: {tool_call.arguments}") +``` + +### Structured Output Example +```python +from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType + +person_schema = { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + "occupation": {"type": "string"}, + }, + "required": ["name", "age", "occupation"], +} + +response_format = JsonSchemaResponseFormat( + type=ResponseFormatType.json_schema, json_schema=person_schema +) + +structured_response = client.inference.chat_completion( + model_id="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + { + "role": "user", + "content": "Create a profile for a fictional person named Alice who is 30 years old and is a software engineer. ", + } + ], + response_format=response_format, +) + +print(f"Structured Response: {structured_response.completion_message.content}") +``` + ### Create Embeddings +> Note on OpenAI embeddings compatibility +> +> NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`. + ```python response = client.inference.embeddings( model_id="nvidia/llama-3.2-nv-embedqa-1b-v2", diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index 7bc3fd0c9..a5475bc92 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -4,11 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
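The embeddings note above draws a line between query and passage embeddings: the OpenAI-compatible endpoint is pinned to `input_type="query"`, so passages should go through the native embeddings API instead. A sketch of that path in the same client style as the examples above (parameter names follow the Llama Stack embeddings API referenced in the note; treat them as assumptions, since the call itself is not shown in this diff):

```python
doc_response = client.inference.embeddings(
    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
    contents=["Paris is the capital of France."],
    task_type="document",  # routed to NVIDIA's input_type for passages, per the note above
)
print(len(doc_response.embeddings[0]))
```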
-import logging import warnings from collections.abc import AsyncIterator -from openai import APIConnectionError, BadRequestError +from openai import NOT_GIVEN, APIConnectionError from llama_stack.apis.common.content_types import ( InterleavedContent, @@ -27,12 +26,16 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, + OpenAIEmbeddingData, + OpenAIEmbeddingsResponse, + OpenAIEmbeddingUsage, ResponseFormat, SamplingParams, TextTruncation, ToolChoice, ToolConfig, ) +from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ToolDefinition, ToolPromptFormat from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, @@ -54,7 +57,7 @@ from .openai_utils import ( ) from .utils import _is_nvidia_hosted -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="inference::nvidia") class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper): @@ -194,15 +197,11 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper): } extra_body["input_type"] = task_type_options[task_type] - try: - response = await self.client.embeddings.create( - model=provider_model_id, - input=input, - extra_body=extra_body, - ) - except BadRequestError as e: - raise ValueError(f"Failed to get embeddings: {e}") from e - + response = await self.client.embeddings.create( + model=provider_model_id, + input=input, + extra_body=extra_body, + ) # # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...) # -> @@ -210,6 +209,57 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper): # return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data]) + async def openai_embeddings( + self, + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, + ) -> OpenAIEmbeddingsResponse: + """ + OpenAI-compatible embeddings for NVIDIA NIM. + + Note: NVIDIA NIM asymmetric embedding models require an "input_type" field not present in the standard OpenAI embeddings API. + We default this to "query" to ensure requests succeed when using the + OpenAI-compatible endpoint. For passage embeddings, use the embeddings API with + `task_type='document'`. + """ + extra_body: dict[str, object] = {"input_type": "query"} + logger.warning( + "NVIDIA OpenAI-compatible embeddings: defaulting to input_type='query'. " + "For passage embeddings, use the embeddings API with task_type='document'." 
+ ) + + response = await self.client.embeddings.create( + model=await self._get_provider_model_id(model), + input=input, + encoding_format=encoding_format if encoding_format is not None else NOT_GIVEN, + dimensions=dimensions if dimensions is not None else NOT_GIVEN, + user=user if user is not None else NOT_GIVEN, + extra_body=extra_body, + ) + + data = [] + for i, embedding_data in enumerate(response.data): + data.append( + OpenAIEmbeddingData( + embedding=embedding_data.embedding, + index=i, + ) + ) + + usage = OpenAIEmbeddingUsage( + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + ) + + return OpenAIEmbeddingsResponse( + data=data, + model=response.model, + usage=usage, + ) + async def chat_completion( self, model_id: str, diff --git a/llama_stack/providers/remote/inference/nvidia/utils.py b/llama_stack/providers/remote/inference/nvidia/utils.py index 74019999e..b8431e859 100644 --- a/llama_stack/providers/remote/inference/nvidia/utils.py +++ b/llama_stack/providers/remote/inference/nvidia/utils.py @@ -4,13 +4,13 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging - import httpx +from llama_stack.log import get_logger + from . import NVIDIAConfig -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="inference::nvidia") def _is_nvidia_hosted(config: NVIDIAConfig) -> bool: diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index a93421536..fcaf5ee92 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -85,7 +85,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .models import MODEL_ENTRIES -logger = get_logger(name=__name__, category="inference") +logger = get_logger(name=__name__, category="inference::ollama") class OllamaInferenceAdapter( @@ -619,28 +619,6 @@ class OllamaInferenceAdapter( response.id = id return response - async def batch_completion( - self, - model_id: str, - content_batch: list[InterleavedContent], - sampling_params: SamplingParams | None = None, - response_format: ResponseFormat | None = None, - logprobs: LogProbConfig | None = None, - ): - raise NotImplementedError("Batch completion is not supported for Ollama") - - async def batch_chat_completion( - self, - model_id: str, - messages_batch: list[list[Message]], - sampling_params: SamplingParams | None = None, - tools: list[ToolDefinition] | None = None, - tool_config: ToolConfig | None = None, - response_format: ResponseFormat | None = None, - logprobs: LogProbConfig | None = None, - ): - raise NotImplementedError("Batch chat completion is not supported for Ollama") - async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]: async def _convert_content(content) -> dict: diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py index 865258559..0f73c9321 100644 --- a/llama_stack/providers/remote/inference/openai/openai.py +++ b/llama_stack/providers/remote/inference/openai/openai.py @@ -4,15 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
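One detail in `openai_embeddings` above: optional parameters fall back to `NOT_GIVEN`, the openai client's omit-this-field sentinel, instead of `None`. Reduced to its essence (illustrative helper, not from the patch):

```python
from openai import NOT_GIVEN


def or_not_given(value):
    # NOT_GIVEN drops the field from the request entirely,
    # whereas an explicit None would be sent in the payload.
    return value if value is not None else NOT_GIVEN
```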
-import logging - +from llama_stack.log import get_logger from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from .config import OpenAIConfig from .models import MODEL_ENTRIES -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="inference::openai") # diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 323831845..97c72d14c 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -5,7 +5,6 @@ # the root directory of this source tree. -import logging from collections.abc import AsyncGenerator from huggingface_hub import AsyncInferenceClient, HfApi @@ -34,6 +33,7 @@ from llama_stack.apis.inference import ( ToolPromptFormat, ) from llama_stack.apis.models import Model +from llama_stack.log import get_logger from llama_stack.models.llama.sku_list import all_registered_models from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.model_registry import ( @@ -58,7 +58,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="inference::tgi") def build_hf_repo_model_entries(): diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index a06e4173b..54c76607f 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -61,7 +61,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import TogetherImplConfig from .models import MODEL_ENTRIES -logger = get_logger(name=__name__, category="inference") +logger = get_logger(name=__name__, category="inference::together") class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProviderData): diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index ac626874c..9e9a80ca5 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -85,7 +85,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import VLLMInferenceAdapterConfig -log = get_logger(name=__name__, category="inference") +log = get_logger(name=__name__, category="inference::vllm") def build_hf_repo_model_entries(): @@ -711,25 +711,3 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): user=user, ) return await self.client.chat.completions.create(**params) # type: ignore - - async def batch_completion( - self, - model_id: str, - content_batch: list[InterleavedContent], - sampling_params: SamplingParams | None = None, - response_format: ResponseFormat | None = None, - logprobs: LogProbConfig | None = None, - ): - raise NotImplementedError("Batch completion is not supported for Ollama") - - async def batch_chat_completion( - self, - model_id: str, - messages_batch: list[list[Message]], - sampling_params: SamplingParams | None = None, - tools: list[ToolDefinition] | None = None, - tool_config: ToolConfig | None = None, - response_format: ResponseFormat | None = None, - logprobs: LogProbConfig | None = None, - ): - raise 
NotImplementedError("Batch chat completion is not supported for Ollama") diff --git a/llama_stack/providers/remote/post_training/nvidia/utils.py b/llama_stack/providers/remote/post_training/nvidia/utils.py index d6e1016b2..162951ff3 100644 --- a/llama_stack/providers/remote/post_training/nvidia/utils.py +++ b/llama_stack/providers/remote/post_training/nvidia/utils.py @@ -4,18 +4,18 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging import warnings from typing import Any from pydantic import BaseModel from llama_stack.apis.post_training import TrainingConfig +from llama_stack.log import get_logger from llama_stack.providers.remote.post_training.nvidia.config import SFTLoRADefaultConfig from .config import NvidiaPostTrainingConfig -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="post_training::nvidia") def warn_unsupported_params(config_dict: Any, supported_keys: set[str], config_name: str) -> None: diff --git a/llama_stack/providers/remote/safety/bedrock/bedrock.py b/llama_stack/providers/remote/safety/bedrock/bedrock.py index 1895e7507..8855e02a4 100644 --- a/llama_stack/providers/remote/safety/bedrock/bedrock.py +++ b/llama_stack/providers/remote/safety/bedrock/bedrock.py @@ -5,7 +5,6 @@ # the root directory of this source tree. import json -import logging from typing import Any from llama_stack.apis.inference import Message @@ -16,12 +15,13 @@ from llama_stack.apis.safety import ( ViolationLevel, ) from llama_stack.apis.shields import Shield +from llama_stack.log import get_logger from llama_stack.providers.datatypes import ShieldsProtocolPrivate from llama_stack.providers.utils.bedrock.client import create_bedrock_client from .config import BedrockSafetyConfig -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="safety::bedrock") class BedrockSafetyAdapter(Safety, ShieldsProtocolPrivate): diff --git a/llama_stack/providers/remote/safety/nvidia/nvidia.py b/llama_stack/providers/remote/safety/nvidia/nvidia.py index 7f17b1cb6..65f901da2 100644 --- a/llama_stack/providers/remote/safety/nvidia/nvidia.py +++ b/llama_stack/providers/remote/safety/nvidia/nvidia.py @@ -4,20 +4,20 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import logging from typing import Any import requests from llama_stack.apis.inference import Message -from llama_stack.apis.safety import RunShieldResponse, Safety, SafetyViolation, ViolationLevel +from llama_stack.apis.safety import ModerationObject, RunShieldResponse, Safety, SafetyViolation, ViolationLevel from llama_stack.apis.shields import Shield +from llama_stack.log import get_logger from llama_stack.providers.datatypes import ShieldsProtocolPrivate from llama_stack.providers.utils.inference.openai_compat import convert_message_to_openai_dict_new from .config import NVIDIASafetyConfig -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="safety::nvidia") class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate): @@ -67,6 +67,9 @@ class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate): self.shield = NeMoGuardrails(self.config, shield.shield_id) return await self.shield.run(messages) + async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject: + raise NotImplementedError("NVIDIA safety provider currently does not implement run_moderation") + class NeMoGuardrails: """ diff --git a/llama_stack/providers/remote/safety/sambanova/sambanova.py b/llama_stack/providers/remote/safety/sambanova/sambanova.py index 6c7190afe..2beb5e0ea 100644 --- a/llama_stack/providers/remote/safety/sambanova/sambanova.py +++ b/llama_stack/providers/remote/safety/sambanova/sambanova.py @@ -5,7 +5,6 @@ # the root directory of this source tree. import json -import logging from typing import Any import litellm @@ -20,12 +19,13 @@ from llama_stack.apis.safety import ( ) from llama_stack.apis.shields import Shield from llama_stack.core.request_headers import NeedsRequestProviderData +from llama_stack.log import get_logger from llama_stack.providers.datatypes import ShieldsProtocolPrivate from llama_stack.providers.utils.inference.openai_compat import convert_message_to_openai_dict_new from .config import SambaNovaSafetyConfig -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="safety::sambanova") CANNED_RESPONSE_TEXT = "I can't answer that. Can I help with something else?" diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py index 8f252711b..a9ec644ef 100644 --- a/llama_stack/providers/remote/vector_io/chroma/chroma.py +++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py @@ -5,7 +5,6 @@ # the root directory of this source tree. 
import asyncio import json -import logging from typing import Any from urllib.parse import urlparse @@ -20,6 +19,7 @@ from llama_stack.apis.vector_io import ( QueryChunksResponse, VectorIO, ) +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig from llama_stack.providers.utils.kvstore import kvstore_impl @@ -33,7 +33,7 @@ from llama_stack.providers.utils.memory.vector_store import ( from .config import ChromaVectorIOConfig as RemoteChromaVectorIOConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="vector_io::chroma") ChromaClientType = chromadb.api.AsyncClientAPI | chromadb.api.ClientAPI diff --git a/llama_stack/providers/remote/vector_io/milvus/milvus.py b/llama_stack/providers/remote/vector_io/milvus/milvus.py index c659bdf6c..e07e8ff12 100644 --- a/llama_stack/providers/remote/vector_io/milvus/milvus.py +++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py @@ -5,7 +5,6 @@ # the root directory of this source tree. import asyncio -import logging import os from typing import Any @@ -21,6 +20,7 @@ from llama_stack.apis.vector_io import ( QueryChunksResponse, VectorIO, ) +from llama_stack.log import get_logger from llama_stack.providers.datatypes import VectorDBsProtocolPrivate from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig from llama_stack.providers.utils.kvstore import kvstore_impl @@ -36,7 +36,7 @@ from llama_stack.providers.utils.vector_io.vector_utils import sanitize_collecti from .config import MilvusVectorIOConfig as RemoteMilvusVectorIOConfig -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="vector_io::milvus") VERSION = "v3" VECTOR_DBS_PREFIX = f"vector_dbs:milvus:{VERSION}::" diff --git a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py index d2a5d910b..1c8d361c2 100644 --- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py +++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging from typing import Any import psycopg2 @@ -22,6 +21,7 @@ from llama_stack.apis.vector_io import ( QueryChunksResponse, VectorIO, ) +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.utils.kvstore import kvstore_impl from llama_stack.providers.utils.kvstore.api import KVStore @@ -34,7 +34,7 @@ from llama_stack.providers.utils.memory.vector_store import ( from .config import PGVectorVectorIOConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="vector_io::pgvector") VERSION = "v3" VECTOR_DBS_PREFIX = f"vector_dbs:pgvector:{VERSION}::" diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index 018015780..0a0faa23a 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -5,7 +5,6 @@ # the root directory of this source tree. 
import asyncio -import logging import uuid from typing import Any @@ -24,6 +23,7 @@ from llama_stack.apis.vector_io import ( VectorStoreChunkingStrategy, VectorStoreFileObject, ) +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl @@ -36,7 +36,7 @@ from llama_stack.providers.utils.memory.vector_store import ( from .config import QdrantVectorIOConfig as RemoteQdrantVectorIOConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="vector_io::qdrant") CHUNK_ID_KEY = "_chunk_id" # KV store prefixes for vector databases diff --git a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py index 966724848..59b6bf124 100644 --- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py +++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py @@ -4,7 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. import json -import logging from typing import Any import weaviate @@ -19,6 +18,7 @@ from llama_stack.apis.files.files import Files from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO from llama_stack.core.request_headers import NeedsRequestProviderData +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.utils.kvstore import kvstore_impl from llama_stack.providers.utils.kvstore.api import KVStore @@ -34,7 +34,7 @@ from llama_stack.providers.utils.vector_io.vector_utils import sanitize_collecti from .config import WeaviateVectorIOConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="vector_io::weaviate") VERSION = "v3" VECTOR_DBS_PREFIX = f"vector_dbs:weaviate:{VERSION}::" diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py index 32e89f987..65ba2854b 100644 --- a/llama_stack/providers/utils/inference/embedding_mixin.py +++ b/llama_stack/providers/utils/inference/embedding_mixin.py @@ -5,10 +5,11 @@ # the root directory of this source tree. 
import base64 -import logging import struct from typing import TYPE_CHECKING +from llama_stack.log import get_logger + if TYPE_CHECKING: from sentence_transformers import SentenceTransformer @@ -27,7 +28,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import interleaved_con EMBEDDING_MODELS = {} -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="providers::utils") class SentenceTransformerEmbeddingMixin: diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index da2e634f6..9bd43e4c9 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -54,7 +54,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( interleaved_content_as_str, ) -logger = get_logger(name=__name__, category="inference") +logger = get_logger(name=__name__, category="providers::utils") class LiteLLMOpenAIMixin( @@ -429,28 +429,6 @@ class LiteLLMOpenAIMixin( ) return await litellm.acompletion(**params) - async def batch_completion( - self, - model_id: str, - content_batch: list[InterleavedContent], - sampling_params: SamplingParams | None = None, - response_format: ResponseFormat | None = None, - logprobs: LogProbConfig | None = None, - ): - raise NotImplementedError("Batch completion is not supported for OpenAI Compat") - - async def batch_chat_completion( - self, - model_id: str, - messages_batch: list[list[Message]], - sampling_params: SamplingParams | None = None, - tools: list[ToolDefinition] | None = None, - tool_config: ToolConfig | None = None, - response_format: ResponseFormat | None = None, - logprobs: LogProbConfig | None = None, - ): - raise NotImplementedError("Batch chat completion is not supported for OpenAI Compat") - async def check_model_availability(self, model: str) -> bool: """ Check if a specific model is available via LiteLLM for the current diff --git a/llama_stack/providers/utils/inference/model_registry.py b/llama_stack/providers/utils/inference/model_registry.py index ddb3bda8c..44add8f9e 100644 --- a/llama_stack/providers/utils/inference/model_registry.py +++ b/llama_stack/providers/utils/inference/model_registry.py @@ -17,7 +17,7 @@ from llama_stack.providers.utils.inference import ( ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR, ) -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="providers::utils") class RemoteInferenceProviderConfig(BaseModel): diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index 5e6c26884..55c2ac0ad 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -5,7 +5,6 @@ # the root directory of this source tree. 
import base64 import json -import logging import struct import time import uuid @@ -122,6 +121,7 @@ from llama_stack.apis.inference import ( from llama_stack.apis.inference import ( OpenAIChoice as OpenAIChatCompletionChoice, ) +from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ( BuiltinTool, StopReason, @@ -134,7 +134,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( decode_assistant_message, ) -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="providers::utils") class OpenAICompatCompletionChoiceDelta(BaseModel): diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py index 72286dffb..f60deee6e 100644 --- a/llama_stack/providers/utils/inference/openai_mixin.py +++ b/llama_stack/providers/utils/inference/openai_mixin.py @@ -25,7 +25,7 @@ from llama_stack.apis.inference import ( from llama_stack.log import get_logger from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="providers::utils") class OpenAIMixin(ABC): diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py index bb9a91b97..a93326e41 100644 --- a/llama_stack/providers/utils/inference/prompt_adapter.py +++ b/llama_stack/providers/utils/inference/prompt_adapter.py @@ -58,7 +58,7 @@ from llama_stack.models.llama.sku_list import resolve_model from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal from llama_stack.providers.utils.inference import supported_inference_models -log = get_logger(name=__name__, category="inference") +log = get_logger(name=__name__, category="providers::utils") class ChatCompletionRequestWithRawContent(ChatCompletionRequest): diff --git a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py index 3842773d9..bab87a4aa 100644 --- a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py +++ b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py @@ -4,16 +4,16 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -import logging from datetime import datetime from pymongo import AsyncMongoClient +from llama_stack.log import get_logger from llama_stack.providers.utils.kvstore import KVStore from ..config import MongoDBKVStoreConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="providers::utils") class MongoDBKVStoreImpl(KVStore): diff --git a/llama_stack/providers/utils/kvstore/postgres/postgres.py b/llama_stack/providers/utils/kvstore/postgres/postgres.py index cabb4c512..56d6dbb48 100644 --- a/llama_stack/providers/utils/kvstore/postgres/postgres.py +++ b/llama_stack/providers/utils/kvstore/postgres/postgres.py @@ -4,16 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import logging from datetime import datetime import psycopg2 from psycopg2.extras import DictCursor +from llama_stack.log import get_logger + from ..api import KVStore from ..config import PostgresKVStoreConfig -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="providers::utils") class PostgresKVStoreImpl(KVStore): diff --git a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py index 120d0d4fc..3acdcf293 100644 --- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py +++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py @@ -44,7 +44,7 @@ from llama_stack.providers.utils.memory.vector_store import ( make_overlapped_chunks, ) -logger = get_logger(__name__, category="vector_io") +logger = get_logger(name=__name__, category="providers::utils") # Constants for OpenAI vector stores CHUNK_MULTIPLIER = 5 diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py index 6ae5bb521..b74080384 100644 --- a/llama_stack/providers/utils/memory/vector_store.py +++ b/llama_stack/providers/utils/memory/vector_store.py @@ -5,7 +5,6 @@ # the root directory of this source tree. import base64 import io -import logging import re import time from abc import ABC, abstractmethod @@ -26,6 +25,7 @@ from llama_stack.apis.common.content_types import ( from llama_stack.apis.tools import RAGDocument from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_io import Chunk, ChunkMetadata, QueryChunksResponse +from llama_stack.log import get_logger from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.providers.datatypes import Api from llama_stack.providers.utils.inference.prompt_adapter import ( @@ -33,7 +33,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( ) from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="providers::utils") class ChunkForDeletion(BaseModel): diff --git a/llama_stack/providers/utils/scheduler.py b/llama_stack/providers/utils/scheduler.py index 65c3d2898..146591b2f 100644 --- a/llama_stack/providers/utils/scheduler.py +++ b/llama_stack/providers/utils/scheduler.py @@ -17,7 +17,7 @@ from pydantic import BaseModel from llama_stack.log import get_logger -logger = get_logger(name=__name__, category="scheduler") +logger = get_logger(name=__name__, category="providers::utils") # TODO: revisit the list of possible statuses when defining a more coherent diff --git a/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py b/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py index ccc835768..867ba2f55 100644 --- a/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +++ b/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py @@ -17,7 +17,7 @@ from llama_stack.log import get_logger from .api import ColumnDefinition, ColumnType, PaginatedResponse, SqlStore from .sqlstore import SqlStoreType -logger = get_logger(name=__name__, category="authorized_sqlstore") +logger = get_logger(name=__name__, category="providers::utils") # Hardcoded copy of the default policy that our SQL filtering implements # WARNING: If default_policy() changes, this constant must be updated accordingly diff --git a/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py b/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py index 
6414929db..f75c35314 100644 --- a/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +++ b/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py @@ -22,6 +22,7 @@ from sqlalchemy import ( text, ) from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine +from sqlalchemy.ext.asyncio.engine import AsyncEngine from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.log import get_logger @@ -29,7 +30,7 @@ from llama_stack.log import get_logger from .api import ColumnDefinition, ColumnType, SqlStore from .sqlstore import SqlAlchemySqlStoreConfig -logger = get_logger(name=__name__, category="sqlstore") +logger = get_logger(name=__name__, category="providers::utils") TYPE_MAPPING: dict[ColumnType, Any] = { ColumnType.INTEGER: Integer, @@ -45,9 +46,12 @@ TYPE_MAPPING: dict[ColumnType, Any] = { class SqlAlchemySqlStoreImpl(SqlStore): def __init__(self, config: SqlAlchemySqlStoreConfig): self.config = config - self.async_session = async_sessionmaker(create_async_engine(config.engine_str)) + self.async_session = async_sessionmaker(self.create_engine()) self.metadata = MetaData() + def create_engine(self) -> AsyncEngine: + return create_async_engine(self.config.engine_str, pool_pre_ping=True) + async def create_table( self, table: str, @@ -83,7 +87,7 @@ class SqlAlchemySqlStoreImpl(SqlStore): else: sqlalchemy_table = self.metadata.tables[table] - engine = create_async_engine(self.config.engine_str) + engine = self.create_engine() async with engine.begin() as conn: await conn.run_sync(self.metadata.create_all, tables=[sqlalchemy_table], checkfirst=True) @@ -241,7 +245,7 @@ class SqlAlchemySqlStoreImpl(SqlStore): nullable: bool = True, ) -> None: """Add a column to an existing table if the column doesn't already exist.""" - engine = create_async_engine(self.config.engine_str) + engine = self.create_engine() try: async with engine.begin() as conn: diff --git a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py index 8dd6061a6..71480364c 100644 --- a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py +++ b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py @@ -5,12 +5,23 @@ # the root directory of this source tree. import json -from datetime import datetime +from datetime import UTC, datetime from typing import Protocol import aiosqlite -from llama_stack.apis.telemetry import QueryCondition, Span, SpanWithStatus, Trace +from llama_stack.apis.telemetry import ( + MetricDataPoint, + MetricLabel, + MetricLabelMatcher, + MetricQueryType, + MetricSeries, + QueryCondition, + QueryMetricsResponse, + Span, + SpanWithStatus, + Trace, +) class TraceStore(Protocol): @@ -29,11 +40,192 @@ class TraceStore(Protocol): max_depth: int | None = None, ) -> dict[str, SpanWithStatus]: ... + async def query_metrics( + self, + metric_name: str, + start_time: datetime, + end_time: datetime | None = None, + granularity: str | None = "1d", + query_type: MetricQueryType = MetricQueryType.RANGE, + label_matchers: list[MetricLabelMatcher] | None = None, + ) -> QueryMetricsResponse: ... 
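For context, the new TraceStore.query_metrics contract added above can be exercised roughly as in the minimal sketch below. The store path ("./trace.db"), metric name ("prompt_tokens"), and label name ("model_id") are illustrative assumptions, not taken from the diff, and MetricLabelMatcher is assumed to accept the name/value/operator keywords that the implementation reads.

    import asyncio
    from datetime import UTC, datetime, timedelta

    from llama_stack.apis.telemetry import MetricLabelMatcher, MetricQueryType
    from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore


    async def main() -> None:
        store = SQLiteTraceStore("./trace.db")  # illustrative path to a telemetry SQLite DB
        response = await store.query_metrics(
            metric_name="prompt_tokens",  # queried internally as span event "metric.prompt_tokens"
            start_time=datetime.now(UTC) - timedelta(days=7),
            granularity="1d",  # one data point per daily bucket
            query_type=MetricQueryType.RANGE,
            label_matchers=[
                # "=~" translates to a LIKE '%value%' substring match in the SQL above
                MetricLabelMatcher(name="model_id", value="llama3", operator="=~"),
            ],
        )
        for series in response.data:
            print(series.metric, [(point.timestamp, point.value) for point in series.values])


    asyncio.run(main())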
+ class SQLiteTraceStore(TraceStore): def __init__(self, conn_string: str): self.conn_string = conn_string + async def query_metrics( + self, + metric_name: str, + start_time: datetime, + end_time: datetime | None = None, + granularity: str | None = None, + query_type: MetricQueryType = MetricQueryType.RANGE, + label_matchers: list[MetricLabelMatcher] | None = None, + ) -> QueryMetricsResponse: + if end_time is None: + end_time = datetime.now(UTC) + + # Build base query + if query_type == MetricQueryType.INSTANT: + query = """ + SELECT + se.name, + SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) as value, + json_extract(se.attributes, '$.unit') as unit, + se.attributes + FROM span_events se + WHERE se.name = ? + AND se.timestamp BETWEEN ? AND ? + """ + else: + if granularity: + time_format = self._get_time_format_for_granularity(granularity) + query = f""" + SELECT + se.name, + SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) as value, + json_extract(se.attributes, '$.unit') as unit, + se.attributes, + strftime('{time_format}', se.timestamp) as bucket_start + FROM span_events se + WHERE se.name = ? + AND se.timestamp BETWEEN ? AND ? + """ + else: + query = """ + SELECT + se.name, + json_extract(se.attributes, '$.value') as value, + json_extract(se.attributes, '$.unit') as unit, + se.attributes, + se.timestamp + FROM span_events se + WHERE se.name = ? + AND se.timestamp BETWEEN ? AND ? + """ + + params = [f"metric.{metric_name}", start_time.isoformat(), end_time.isoformat()] + + # Labels that will be attached to the MetricSeries (preserve matcher labels) + all_labels: list[MetricLabel] = [] + matcher_label_names = set() + if label_matchers: + for matcher in label_matchers: + json_path = f"$.{matcher.name}" + if matcher.operator == "=": + query += f" AND json_extract(se.attributes, '{json_path}') = ?" + params.append(matcher.value) + elif matcher.operator == "!=": + query += f" AND json_extract(se.attributes, '{json_path}') != ?" + params.append(matcher.value) + elif matcher.operator == "=~": + query += f" AND json_extract(se.attributes, '{json_path}') LIKE ?" + params.append(f"%{matcher.value}%") + elif matcher.operator == "!~": + query += f" AND json_extract(se.attributes, '{json_path}') NOT LIKE ?" + params.append(f"%{matcher.value}%") + # Preserve filter context in output + all_labels.append(MetricLabel(name=matcher.name, value=str(matcher.value))) + matcher_label_names.add(matcher.name) + + # GROUP BY / ORDER BY logic + if query_type == MetricQueryType.RANGE and granularity: + group_time_format = self._get_time_format_for_granularity(granularity) + query += f" GROUP BY strftime('{group_time_format}', se.timestamp), json_extract(se.attributes, '$.unit')" + query += " ORDER BY bucket_start" + elif query_type == MetricQueryType.INSTANT: + query += " GROUP BY json_extract(se.attributes, '$.unit')" + else: + query += " ORDER BY se.timestamp" + + # Execute query + async with aiosqlite.connect(self.conn_string) as conn: + conn.row_factory = aiosqlite.Row + async with conn.execute(query, params) as cursor: + rows = await cursor.fetchall() + + if not rows: + return QueryMetricsResponse(data=[]) + + data_points = [] + # We want to add attribute labels, but only those not already present as matcher labels. + attr_label_names = set() + for row in rows: + # Parse JSON attributes safely; if a row has no attributes (unusual), just skip adding labels for it.
+ try: + attributes = json.loads(row["attributes"] or "{}") + except (TypeError, json.JSONDecodeError): + attributes = {} + + value = row["value"] + unit = row["unit"] or "" + + # Add labels from attributes without duplicating matcher labels; otherwise the result would contain many duplicate labels. + for k, v in attributes.items(): + if k not in ["value", "unit"] and k not in matcher_label_names and k not in attr_label_names: + all_labels.append(MetricLabel(name=k, value=str(v))) + attr_label_names.add(k) + + # Determine timestamp + if query_type == MetricQueryType.RANGE and granularity: + try: + bucket_start_raw = row["bucket_start"] + except (KeyError, IndexError) as e: # sqlite3.Row raises IndexError for a missing column + raise ValueError( + "DB did not have a bucket_start time in row when using granularity; this indicates improper formatting" + ) from e + # the column may be present but NULL + if bucket_start_raw is None: + raise ValueError("bucket_start is None; check the time format and data") + bucket_start = datetime.fromisoformat(bucket_start_raw) + timestamp = int(bucket_start.timestamp()) + elif query_type == MetricQueryType.INSTANT: + timestamp = int(datetime.now(UTC).timestamp()) + else: + try: + timestamp_raw = row["timestamp"] + except (KeyError, IndexError) as e: # sqlite3.Row raises IndexError for a missing column + raise ValueError( + "DB did not have a timestamp in row; this indicates improper formatting" + ) from e + # the column may be present but NULL + if timestamp_raw is None: + raise ValueError("timestamp is None; check the time format and data") + timestamp_iso = datetime.fromisoformat(timestamp_raw) + timestamp = int(timestamp_iso.timestamp()) + + data_points.append( + MetricDataPoint( + timestamp=timestamp, + value=value, + unit=unit, + ) + ) + + metric_series = [MetricSeries(metric=metric_name, labels=all_labels, values=data_points)] + return QueryMetricsResponse(data=metric_series) + + def _get_time_format_for_granularity(self, granularity: str | None) -> str: + """Get the SQLite strftime format string for a given granularity. + Args: + granularity: Granularity string (e.g., "1m", "5m", "1h", "1d") + Returns: + SQLite strftime format string for the granularity + """ + if granularity is None: + raise ValueError("granularity cannot be None for this method - use separate logic for no aggregation") + + if granularity.endswith("d"): + return "%Y-%m-%d 00:00:00" + elif granularity.endswith("h"): + return "%Y-%m-%d %H:00:00" + elif granularity.endswith("m"): + return "%Y-%m-%d %H:%M:00" + else: + return "%Y-%m-%d %H:%M:00" # Default to the most granular format, which yields the most timestamps.
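To make the bucketing concrete, here is a small standalone sketch (not part of the diff) that mirrors in Python what the SQL GROUP BY over strftime computes; the sample timestamps are invented. Because the literal "00" parts of each format string pass through strftime unchanged, the finer time fields collapse to zero and rows sharing a prefix fall into the same bucket.

    from datetime import datetime

    # Formats returned by _get_time_format_for_granularity above:
    # "1d" -> "%Y-%m-%d 00:00:00", "1h" -> "%Y-%m-%d %H:00:00", "1m" -> "%Y-%m-%d %H:%M:00"
    samples = ["2025-01-02T10:15:30", "2025-01-02T10:45:05", "2025-01-02T11:05:00"]

    def buckets(fmt: str) -> list[str]:
        # distinct bucket keys produced by formatting each timestamp
        return sorted({datetime.fromisoformat(s).strftime(fmt) for s in samples})

    print(buckets("%Y-%m-%d 00:00:00"))  # ['2025-01-02 00:00:00'] -> one daily bucket
    print(buckets("%Y-%m-%d %H:00:00"))  # 10:00 and 11:00 -> two hourly buckets
    print(buckets("%Y-%m-%d %H:%M:00"))  # three distinct per-minute buckets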
+ async def query_traces( self, attribute_filters: list[QueryCondition] | None = None, diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index 7080e774a..7694003b5 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -6,7 +6,7 @@ import asyncio import contextvars -import logging +import logging # allow-direct-logging import queue import random import sys diff --git a/llama_stack/ui/app/chat-playground/page.test.tsx b/llama_stack/ui/app/chat-playground/page.test.tsx new file mode 100644 index 000000000..54c15f95a --- /dev/null +++ b/llama_stack/ui/app/chat-playground/page.test.tsx @@ -0,0 +1,587 @@ +import React from "react"; +import { + render, + screen, + fireEvent, + waitFor, + act, +} from "@testing-library/react"; +import "@testing-library/jest-dom"; +import ChatPlaygroundPage from "./page"; + +const mockClient = { + agents: { + list: jest.fn(), + create: jest.fn(), + retrieve: jest.fn(), + delete: jest.fn(), + session: { + list: jest.fn(), + create: jest.fn(), + delete: jest.fn(), + retrieve: jest.fn(), + }, + turn: { + create: jest.fn(), + }, + }, + models: { + list: jest.fn(), + }, + toolgroups: { + list: jest.fn(), + }, +}; + +jest.mock("@/hooks/use-auth-client", () => ({ + useAuthClient: jest.fn(() => mockClient), +})); + +jest.mock("@/components/chat-playground/chat", () => ({ + Chat: jest.fn( + ({ + className, + messages, + handleSubmit, + input, + handleInputChange, + isGenerating, + append, + suggestions, + }) => ( +
<div className={className}>
+        <div>{messages.length}</div>
+        <input value={input} onChange={handleInputChange} disabled={isGenerating} />
+        <button onClick={handleSubmit} disabled={isGenerating}>
+          Send
+        </button>
+        {suggestions?.map((suggestion: string, index: number) => (
+          <button key={index} onClick={() => append({ role: "user", content: suggestion })}>
+            {suggestion}
+          </button>
+        ))}
+      </div>
+    )
+  ),
+}));
+
+jest.mock("@/components/chat-playground/conversations", () => ({
+  SessionManager: jest.fn(({ selectedAgentId, onNewSession }) => (
+    <div>
+      {selectedAgentId && (
+        <>
+          <span>{selectedAgentId}</span>
+          <button onClick={onNewSession}>New Session</button>
+        </>
+      )}
+    </div>
+ )), + SessionUtils: { + saveCurrentSessionId: jest.fn(), + loadCurrentSessionId: jest.fn(), + loadCurrentAgentId: jest.fn(), + saveCurrentAgentId: jest.fn(), + clearCurrentSession: jest.fn(), + saveSessionData: jest.fn(), + loadSessionData: jest.fn(), + saveAgentConfig: jest.fn(), + loadAgentConfig: jest.fn(), + clearAgentCache: jest.fn(), + createDefaultSession: jest.fn(() => ({ + id: "test-session-123", + name: "Default Session", + messages: [], + selectedModel: "", + systemMessage: "You are a helpful assistant.", + agentId: "test-agent-123", + createdAt: Date.now(), + updatedAt: Date.now(), + })), + }, +})); + +const mockAgents = [ + { + agent_id: "agent_123", + agent_config: { + name: "Test Agent", + instructions: "You are a test assistant.", + }, + }, + { + agent_id: "agent_456", + agent_config: { + agent_name: "Another Agent", + instructions: "You are another assistant.", + }, + }, +]; + +const mockModels = [ + { + identifier: "test-model-1", + model_type: "llm", + }, + { + identifier: "test-model-2", + model_type: "llm", + }, +]; + +const mockToolgroups = [ + { + identifier: "builtin::rag", + provider_id: "test-provider", + type: "tool_group", + provider_resource_id: "test-resource", + }, +]; + +describe("ChatPlaygroundPage", () => { + beforeEach(() => { + jest.clearAllMocks(); + Element.prototype.scrollIntoView = jest.fn(); + mockClient.agents.list.mockResolvedValue({ data: mockAgents }); + mockClient.models.list.mockResolvedValue(mockModels); + mockClient.toolgroups.list.mockResolvedValue(mockToolgroups); + mockClient.agents.session.create.mockResolvedValue({ + session_id: "new-session-123", + }); + mockClient.agents.session.list.mockResolvedValue({ data: [] }); + mockClient.agents.session.retrieve.mockResolvedValue({ + session_id: "test-session", + session_name: "Test Session", + started_at: new Date().toISOString(), + turns: [], + }); // No turns by default + mockClient.agents.retrieve.mockResolvedValue({ + agent_id: "test-agent", + agent_config: { + toolgroups: ["builtin::rag"], + instructions: "Test instructions", + model: "test-model", + }, + }); + mockClient.agents.delete.mockResolvedValue(undefined); + }); + + describe("Agent Selector Rendering", () => { + test("shows agent selector when agents are available", async () => { + await act(async () => { + render(); + }); + + await waitFor(() => { + expect(screen.getByText("Agent Session:")).toBeInTheDocument(); + expect(screen.getAllByRole("combobox")).toHaveLength(2); + expect(screen.getByText("+ New Agent")).toBeInTheDocument(); + expect(screen.getByText("Clear Chat")).toBeInTheDocument(); + }); + }); + + test("does not show agent selector when no agents are available", async () => { + mockClient.agents.list.mockResolvedValue({ data: [] }); + + await act(async () => { + render(); + }); + + await waitFor(() => { + expect(screen.queryByText("Agent Session:")).not.toBeInTheDocument(); + expect(screen.getAllByRole("combobox")).toHaveLength(1); + expect(screen.getByText("+ New Agent")).toBeInTheDocument(); + expect(screen.queryByText("Clear Chat")).not.toBeInTheDocument(); + }); + }); + + test("does not show agent selector while loading", async () => { + mockClient.agents.list.mockImplementation(() => new Promise(() => {})); + + await act(async () => { + render(); + }); + + expect(screen.queryByText("Agent Session:")).not.toBeInTheDocument(); + expect(screen.getAllByRole("combobox")).toHaveLength(1); + expect(screen.getByText("+ New Agent")).toBeInTheDocument(); + expect(screen.queryByText("Clear Chat")).not.toBeInTheDocument(); 
+ }); + + test("shows agent options in selector", async () => { + await act(async () => { + render(); + }); + + await waitFor(() => { + const agentCombobox = screen.getAllByRole("combobox").find(element => { + return ( + element.textContent?.includes("Test Agent") || + element.textContent?.includes("Select Agent") + ); + }); + expect(agentCombobox).toBeDefined(); + fireEvent.click(agentCombobox!); + }); + + await waitFor(() => { + expect(screen.getAllByText("Test Agent")).toHaveLength(2); + expect(screen.getByText("Another Agent")).toBeInTheDocument(); + }); + }); + + test("displays agent ID when no name is available", async () => { + const agentWithoutName = { + agent_id: "agent_789", + agent_config: { + instructions: "You are an agent without a name.", + }, + }; + + mockClient.agents.list.mockResolvedValue({ data: [agentWithoutName] }); + + await act(async () => { + render(); + }); + + await waitFor(() => { + const agentCombobox = screen.getAllByRole("combobox").find(element => { + return ( + element.textContent?.includes("Agent agent_78") || + element.textContent?.includes("Select Agent") + ); + }); + expect(agentCombobox).toBeDefined(); + fireEvent.click(agentCombobox!); + }); + + await waitFor(() => { + expect(screen.getAllByText("Agent agent_78...")).toHaveLength(2); + }); + }); + }); + + describe("Agent Creation Modal", () => { + test("opens agent creation modal when + New Agent is clicked", async () => { + await act(async () => { + render(); + }); + + const newAgentButton = screen.getByText("+ New Agent"); + fireEvent.click(newAgentButton); + + expect(screen.getByText("Create New Agent")).toBeInTheDocument(); + expect(screen.getByText("Agent Name (optional)")).toBeInTheDocument(); + expect(screen.getAllByText("Model")).toHaveLength(2); + expect(screen.getByText("System Instructions")).toBeInTheDocument(); + expect(screen.getByText("Tools (optional)")).toBeInTheDocument(); + }); + + test("closes modal when Cancel is clicked", async () => { + await act(async () => { + render(); + }); + + const newAgentButton = screen.getByText("+ New Agent"); + fireEvent.click(newAgentButton); + + const cancelButton = screen.getByText("Cancel"); + fireEvent.click(cancelButton); + + expect(screen.queryByText("Create New Agent")).not.toBeInTheDocument(); + }); + + test("creates agent when Create Agent is clicked", async () => { + mockClient.agents.create.mockResolvedValue({ agent_id: "new-agent-123" }); + mockClient.agents.list + .mockResolvedValueOnce({ data: mockAgents }) + .mockResolvedValueOnce({ + data: [ + ...mockAgents, + { agent_id: "new-agent-123", agent_config: { name: "New Agent" } }, + ], + }); + + await act(async () => { + render(); + }); + + const newAgentButton = screen.getByText("+ New Agent"); + await act(async () => { + fireEvent.click(newAgentButton); + }); + + await waitFor(() => { + expect(screen.getByText("Create New Agent")).toBeInTheDocument(); + }); + + const nameInput = screen.getByPlaceholderText("My Custom Agent"); + await act(async () => { + fireEvent.change(nameInput, { target: { value: "Test Agent Name" } }); + }); + + const instructionsTextarea = screen.getByDisplayValue( + "You are a helpful assistant." 
+ ); + await act(async () => { + fireEvent.change(instructionsTextarea, { + target: { value: "Custom instructions" }, + }); + }); + + await waitFor(() => { + const modalModelSelectors = screen + .getAllByRole("combobox") + .filter(el => { + return ( + el.textContent?.includes("Select Model") || + el.closest('[class*="modal"]') || + el.closest('[class*="card"]') + ); + }); + expect(modalModelSelectors.length).toBeGreaterThan(0); + }); + + const modalModelSelectors = screen.getAllByRole("combobox").filter(el => { + return ( + el.textContent?.includes("Select Model") || + el.closest('[class*="modal"]') || + el.closest('[class*="card"]') + ); + }); + + await act(async () => { + fireEvent.click(modalModelSelectors[0]); + }); + + await waitFor(() => { + const modelOptions = screen.getAllByText("test-model-1"); + expect(modelOptions.length).toBeGreaterThan(0); + }); + + const modelOptions = screen.getAllByText("test-model-1"); + const dropdownOption = modelOptions.find( + option => + option.closest('[role="option"]') || + option.id?.includes("radix") || + option.getAttribute("aria-selected") !== null + ); + + await act(async () => { + fireEvent.click( + dropdownOption || modelOptions[modelOptions.length - 1] + ); + }); + + await waitFor(() => { + const createButton = screen.getByText("Create Agent"); + expect(createButton).not.toBeDisabled(); + }); + + const createButton = screen.getByText("Create Agent"); + await act(async () => { + fireEvent.click(createButton); + }); + + await waitFor(() => { + expect(mockClient.agents.create).toHaveBeenCalledWith({ + agent_config: { + model: expect.any(String), + instructions: "Custom instructions", + name: "Test Agent Name", + enable_session_persistence: true, + }, + }); + }); + + await waitFor(() => { + expect(screen.queryByText("Create New Agent")).not.toBeInTheDocument(); + }); + }); + }); + + describe("Agent Selection", () => { + test("creates default session when agent is selected", async () => { + await act(async () => { + render(); + }); + + await waitFor(() => { + // first agent should be auto-selected + expect(mockClient.agents.session.create).toHaveBeenCalledWith( + "agent_123", + { session_name: "Default Session" } + ); + }); + }); + + test("switches agent when different agent is selected", async () => { + await act(async () => { + render(); + }); + + await waitFor(() => { + const agentCombobox = screen.getAllByRole("combobox").find(element => { + return ( + element.textContent?.includes("Test Agent") || + element.textContent?.includes("Select Agent") + ); + }); + expect(agentCombobox).toBeDefined(); + fireEvent.click(agentCombobox!); + }); + + await waitFor(() => { + const anotherAgentOption = screen.getByText("Another Agent"); + fireEvent.click(anotherAgentOption); + }); + + expect(mockClient.agents.session.create).toHaveBeenCalledWith( + "agent_456", + { session_name: "Default Session" } + ); + }); + }); + + describe("Agent Deletion", () => { + test("shows delete button when multiple agents exist", async () => { + await act(async () => { + render(); + }); + + await waitFor(() => { + expect(screen.getByTitle("Delete current agent")).toBeInTheDocument(); + }); + }); + + test("hides delete button when only one agent exists", async () => { + mockClient.agents.list.mockResolvedValue({ + data: [mockAgents[0]], + }); + + await act(async () => { + render(); + }); + + await waitFor(() => { + expect( + screen.queryByTitle("Delete current agent") + ).not.toBeInTheDocument(); + }); + }); + + test("deletes agent and switches to another when confirmed", 
async () => { + global.confirm = jest.fn(() => true); + + await act(async () => { + render(); + }); + + await waitFor(() => { + expect(screen.getByTitle("Delete current agent")).toBeInTheDocument(); + }); + + mockClient.agents.delete.mockResolvedValue(undefined); + mockClient.agents.list.mockResolvedValueOnce({ data: mockAgents }); + mockClient.agents.list.mockResolvedValueOnce({ + data: [mockAgents[1]], + }); + + const deleteButton = screen.getByTitle("Delete current agent"); + await act(async () => { + deleteButton.click(); + }); + + await waitFor(() => { + expect(mockClient.agents.delete).toHaveBeenCalledWith("agent_123"); + expect(global.confirm).toHaveBeenCalledWith( + "Are you sure you want to delete this agent? This action cannot be undone and will delete all associated sessions." + ); + }); + + (global.confirm as jest.Mock).mockRestore(); + }); + + test("does not delete agent when cancelled", async () => { + global.confirm = jest.fn(() => false); + + await act(async () => { + render(); + }); + + await waitFor(() => { + expect(screen.getByTitle("Delete current agent")).toBeInTheDocument(); + }); + + const deleteButton = screen.getByTitle("Delete current agent"); + await act(async () => { + deleteButton.click(); + }); + + await waitFor(() => { + expect(global.confirm).toHaveBeenCalled(); + expect(mockClient.agents.delete).not.toHaveBeenCalled(); + }); + + (global.confirm as jest.Mock).mockRestore(); + }); + }); + + describe("Error Handling", () => { + test("handles agent loading errors gracefully", async () => { + mockClient.agents.list.mockRejectedValue( + new Error("Failed to load agents") + ); + const consoleSpy = jest + .spyOn(console, "error") + .mockImplementation(() => {}); + + await act(async () => { + render(); + }); + + await waitFor(() => { + expect(consoleSpy).toHaveBeenCalledWith( + "Error fetching agents:", + expect.any(Error) + ); + }); + + expect(screen.getByText("+ New Agent")).toBeInTheDocument(); + + consoleSpy.mockRestore(); + }); + + test("handles model loading errors gracefully", async () => { + mockClient.models.list.mockRejectedValue( + new Error("Failed to load models") + ); + const consoleSpy = jest + .spyOn(console, "error") + .mockImplementation(() => {}); + + await act(async () => { + render(); + }); + + await waitFor(() => { + expect(consoleSpy).toHaveBeenCalledWith( + "Error fetching models:", + expect.any(Error) + ); + }); + + consoleSpy.mockRestore(); + }); + }); +}); diff --git a/llama_stack/ui/app/chat-playground/page.tsx b/llama_stack/ui/app/chat-playground/page.tsx index b8651aca0..f26791a41 100644 --- a/llama_stack/ui/app/chat-playground/page.tsx +++ b/llama_stack/ui/app/chat-playground/page.tsx @@ -1,6 +1,6 @@ "use client"; -import { useState, useEffect } from "react"; +import { useState, useEffect, useCallback, useRef } from "react"; import { flushSync } from "react-dom"; import { Button } from "@/components/ui/button"; import { @@ -10,14 +10,22 @@ import { SelectTrigger, SelectValue, } from "@/components/ui/select"; +import { Card } from "@/components/ui/card"; +import { Input } from "@/components/ui/input"; +import { Trash2 } from "lucide-react"; import { Chat } from "@/components/chat-playground/chat"; import { type Message } from "@/components/chat-playground/chat-message"; import { useAuthClient } from "@/hooks/use-auth-client"; -import type { CompletionCreateParams } from "llama-stack-client/resources/chat/completions"; import type { Model } from "llama-stack-client/resources/models"; - +import type { TurnCreateParams } from 
"llama-stack-client/resources/agents/turn"; +import { + SessionUtils, + type ChatSession, +} from "@/components/chat-playground/conversations"; export default function ChatPlaygroundPage() { - const [messages, setMessages] = useState([]); + const [currentSession, setCurrentSession] = useState( + null + ); const [input, setInput] = useState(""); const [isGenerating, setIsGenerating] = useState(false); const [error, setError] = useState(null); @@ -25,10 +33,523 @@ export default function ChatPlaygroundPage() { const [selectedModel, setSelectedModel] = useState(""); const [modelsLoading, setModelsLoading] = useState(true); const [modelsError, setModelsError] = useState(null); + const [agents, setAgents] = useState< + Array<{ + agent_id: string; + agent_config?: { + agent_name?: string; + name?: string; + instructions?: string; + }; + [key: string]: unknown; + }> + >([]); + const [selectedAgentConfig, setSelectedAgentConfig] = useState<{ + toolgroups?: Array< + string | { name: string; args: Record } + >; + } | null>(null); + const [selectedAgentId, setSelectedAgentId] = useState(""); + const [agentsLoading, setAgentsLoading] = useState(true); + const [showCreateAgent, setShowCreateAgent] = useState(false); + const [newAgentName, setNewAgentName] = useState(""); + const [newAgentInstructions, setNewAgentInstructions] = useState( + "You are a helpful assistant." + ); + const [selectedToolgroups, setSelectedToolgroups] = useState([]); + const [availableToolgroups, setAvailableToolgroups] = useState< + Array<{ + identifier: string; + provider_id: string; + type: string; + provider_resource_id?: string; + }> + >([]); const client = useAuthClient(); + const abortControllerRef = useRef(null); const isModelsLoading = modelsLoading ?? true; + const loadAgentConfig = useCallback( + async (agentId: string) => { + try { + console.log("Loading agent config for:", agentId); + + // try to load from cache first + const cachedConfig = SessionUtils.loadAgentConfig(agentId); + if (cachedConfig) { + console.log("✅ Loaded agent config from cache:", cachedConfig); + setSelectedAgentConfig({ + toolgroups: cachedConfig.toolgroups, + }); + return; + } + + console.log("📡 Fetching agent config from API..."); + const agentDetails = await client.agents.retrieve(agentId); + console.log("Agent details retrieved:", agentDetails); + console.log("Agent config:", agentDetails.agent_config); + console.log("Agent toolgroups:", agentDetails.agent_config?.toolgroups); + + // cache the config + SessionUtils.saveAgentConfig(agentId, agentDetails.agent_config); + + setSelectedAgentConfig({ + toolgroups: agentDetails.agent_config?.toolgroups, + }); + } catch (error) { + console.error("Error loading agent config:", error); + setSelectedAgentConfig(null); + } + }, + [client] + ); + + const createDefaultSession = useCallback( + async (agentId: string) => { + try { + const response = await client.agents.session.create(agentId, { + session_name: "Default Session", + }); + + const defaultSession: ChatSession = { + id: response.session_id, + name: "Default Session", + messages: [], + selectedModel: selectedModel, // Use current selected model + systemMessage: "You are a helpful assistant.", + agentId, + createdAt: Date.now(), + updatedAt: Date.now(), + }; + + setCurrentSession(defaultSession); + console.log( + `💾 Saving default session ID for agent ${agentId}:`, + defaultSession.id + ); + SessionUtils.saveCurrentSessionId(defaultSession.id, agentId); + // cache entire session data + SessionUtils.saveSessionData(agentId, defaultSession); + } 
catch (error) { + console.error("Error creating default session:", error); + } + }, + [client, selectedModel] + ); + + const loadSessionMessages = useCallback( + async (agentId: string, sessionId: string): Promise => { + try { + const session = await client.agents.session.retrieve( + agentId, + sessionId + ); + + if (!session || !session.turns || !Array.isArray(session.turns)) { + return []; + } + + const messages: Message[] = []; + for (const turn of session.turns) { + // add user messages + if (turn.input_messages && Array.isArray(turn.input_messages)) { + for (const input of turn.input_messages) { + if (input.role === "user" && input.content) { + messages.push({ + id: `${turn.turn_id}-user-${messages.length}`, + role: "user", + content: + typeof input.content === "string" + ? input.content + : JSON.stringify(input.content), + createdAt: new Date(turn.started_at || Date.now()), + }); + } + } + } + + // add assistant message from output_message + if (turn.output_message && turn.output_message.content) { + messages.push({ + id: `${turn.turn_id}-assistant-${messages.length}`, + role: "assistant", + content: + typeof turn.output_message.content === "string" + ? turn.output_message.content + : JSON.stringify(turn.output_message.content), + createdAt: new Date( + turn.completed_at || turn.started_at || Date.now() + ), + }); + } + } + + return messages; + } catch (error) { + console.error("Error loading session messages:", error); + return []; + } + }, + [client] + ); + + const loadAgentSessions = useCallback( + async (agentId: string) => { + try { + console.log("Loading sessions for agent:", agentId); + const response = await client.agents.session.list(agentId); + console.log("Available sessions:", response.data); + + if ( + response.data && + Array.isArray(response.data) && + response.data.length > 0 + ) { + // check for a previously saved session ID for this specific agent + const savedSessionId = SessionUtils.loadCurrentSessionId(agentId); + console.log(`Saved session ID for agent ${agentId}:`, savedSessionId); + + // try to load cached session data first + if (savedSessionId) { + const cachedSession = SessionUtils.loadSessionData( + agentId, + savedSessionId + ); + if (cachedSession) { + console.log("✅ Loaded session from cache:", cachedSession.id); + setCurrentSession(cachedSession); + SessionUtils.saveCurrentSessionId(cachedSession.id, agentId); + return; + } + console.log("📡 Cache miss, fetching session from API..."); + } + + let sessionToLoad = response.data[0] as { + session_id: string; + session_name?: string; + started_at?: string; + }; + console.log( + "Default session to load (first in list):", + sessionToLoad.session_id + ); + + // try to find saved session id in available sessions + if (savedSessionId) { + const foundSession = response.data.find( + (s: { session_id: string }) => s.session_id === savedSessionId + ); + console.log("Found saved session in list:", foundSession); + if (foundSession) { + sessionToLoad = foundSession as { + session_id: string; + session_name?: string; + started_at?: string; + }; + console.log( + "✅ Restored previously selected session:", + savedSessionId + ); + } else { + console.log( + "❌ Previously selected session not found, using latest session" + ); + } + } else { + console.log("❌ No saved session ID found, using latest session"); + } + + const messages = await loadSessionMessages( + agentId, + sessionToLoad.session_id + ); + + const session: ChatSession = { + id: sessionToLoad.session_id, + name: sessionToLoad.session_name || "Session", + 
messages, + selectedModel: selectedModel || "", // Preserve current model or use empty + systemMessage: "You are a helpful assistant.", + agentId, + createdAt: sessionToLoad.started_at + ? new Date(sessionToLoad.started_at).getTime() + : Date.now(), + updatedAt: Date.now(), + }; + + setCurrentSession(session); + console.log(`💾 Saving session ID for agent ${agentId}:`, session.id); + SessionUtils.saveCurrentSessionId(session.id, agentId); + // cache session data + SessionUtils.saveSessionData(agentId, session); + } else { + // no sessions, create a new one + await createDefaultSession(agentId); + } + } catch (error) { + console.error("Error loading agent sessions:", error); + // fallback to creating a new session + await createDefaultSession(agentId); + } + }, + [client, loadSessionMessages, createDefaultSession, selectedModel] + ); + + useEffect(() => { + const fetchAgents = async () => { + try { + setAgentsLoading(true); + const agentList = await client.agents.list(); + setAgents( + (agentList.data as Array<{ + agent_id: string; + agent_config?: { + agent_name?: string; + name?: string; + instructions?: string; + }; + [key: string]: unknown; + }>) || [] + ); + + if (agentList.data && agentList.data.length > 0) { + // check if there's a previously selected agent + const savedAgentId = SessionUtils.loadCurrentAgentId(); + + let agentToSelect = agentList.data[0] as { + agent_id: string; + agent_config?: { + agent_name?: string; + name?: string; + instructions?: string; + }; + [key: string]: unknown; + }; + + // if we have a saved agent ID, find it in the available agents + if (savedAgentId) { + const foundAgent = agentList.data.find( + (a: { agent_id: string }) => a.agent_id === savedAgentId + ); + if (foundAgent) { + agentToSelect = foundAgent as typeof agentToSelect; + } else { + console.log("Previously selected agent not found"); + } + } + setSelectedAgentId(agentToSelect.agent_id); + SessionUtils.saveCurrentAgentId(agentToSelect.agent_id); + // load agent config immediately + await loadAgentConfig(agentToSelect.agent_id); + // Note: loadAgentSessions will be called after models are loaded + } + } catch (error) { + console.error("Error fetching agents:", error); + } finally { + setAgentsLoading(false); + } + }; + + fetchAgents(); + + // fetch available toolgroups + const fetchToolgroups = async () => { + try { + console.log("Fetching toolgroups..."); + const toolgroups = await client.toolgroups.list(); + console.log("Toolgroups response:", toolgroups); + + // The client returns data directly, not wrapped in .data + const toolGroupsArray = Array.isArray(toolgroups) + ? toolgroups + : toolgroups && + typeof toolgroups === "object" && + "data" in toolgroups && + Array.isArray((toolgroups as { data: unknown }).data) + ?
( + toolgroups as { + data: Array<{ + identifier: string; + provider_id: string; + type: string; + provider_resource_id?: string; + }>; + } + ).data + : []; + + if (toolGroupsArray && Array.isArray(toolGroupsArray)) { + setAvailableToolgroups(toolGroupsArray); + console.log("Set toolgroups:", toolGroupsArray); + } else { + console.error("Invalid toolgroups data format:", toolgroups); + } + } catch (error) { + console.error("Error fetching toolgroups:", error); + if (error instanceof Error) { + console.error("Error details:", { + name: error.name, + message: error.message, + stack: error.stack, + }); + } + } + }; + + fetchToolgroups(); + }, [client, loadAgentSessions, loadAgentConfig]); + + const createNewAgent = useCallback( + async ( + name: string, + instructions: string, + model: string, + toolgroups: string[] = [] + ) => { + try { + console.log("Creating agent with toolgroups:", toolgroups); + const agentConfig = { + model, + instructions, + name: name || undefined, + enable_session_persistence: true, + toolgroups: toolgroups.length > 0 ? toolgroups : undefined, + }; + console.log("Agent config being sent:", agentConfig); + + const response = await client.agents.create({ + agent_config: agentConfig, + }); + + // refresh agents list + const agentList = await client.agents.list(); + setAgents( + (agentList.data as Array<{ + agent_id: string; + agent_config?: { + agent_name?: string; + name?: string; + instructions?: string; + }; + [key: string]: unknown; + }>) || [] + ); + + // set the new agent as selected + setSelectedAgentId(response.agent_id); + await loadAgentConfig(response.agent_id); + await loadAgentSessions(response.agent_id); + + return response.agent_id; + } catch (error) { + console.error("Error creating agent:", error); + throw error; + } + }, + [client, loadAgentSessions, loadAgentConfig] + ); + + const deleteAgent = useCallback( + async (agentId: string) => { + if (agents.length <= 1) { + return; + } + + if ( + confirm( + "Are you sure you want to delete this agent? This action cannot be undone and will delete all associated sessions." + ) + ) { + try { + await client.agents.delete(agentId); + + // clear cached data for agent + SessionUtils.clearAgentCache(agentId); + + // Refresh agents list + const agentList = await client.agents.list(); + setAgents( + (agentList.data as Array<{ + agent_id: string; + agent_config?: { + agent_name?: string; + name?: string; + instructions?: string; + }; + [key: string]: unknown; + }>) || [] + ); + + // if we deleted the current agent, switch to another one + if (selectedAgentId === agentId) { + const remainingAgents = agentList.data?.filter( + (a: { agent_id: string }) => a.agent_id !== agentId + ); + if (remainingAgents && remainingAgents.length > 0) { + const newAgent = remainingAgents[0] as { + agent_id: string; + agent_config?: { + agent_name?: string; + name?: string; + instructions?: string; + }; + [key: string]: unknown; + }; + setSelectedAgentId(newAgent.agent_id); + SessionUtils.saveCurrentAgentId(newAgent.agent_id); + await loadAgentConfig(newAgent.agent_id); + await loadAgentSessions(newAgent.agent_id); + } else { + // No agents left + setSelectedAgentId(""); + setCurrentSession(null); + setSelectedAgentConfig(null); + } + } + } catch (error) { + console.error("Error deleting agent:", error); + } + } + }, + [agents.length, client, selectedAgentId, loadAgentConfig, loadAgentSessions] + ); + + const handleModelChange = useCallback((newModel: string) => { + setSelectedModel(newModel); + setCurrentSession(prev => + prev + ? 
{ + ...prev, + selectedModel: newModel, + updatedAt: Date.now(), + } + : prev + ); + }, []); + + useEffect(() => { + if (currentSession) { + console.log( + `💾 Auto-saving session ID for agent ${currentSession.agentId}:`, + currentSession.id + ); + SessionUtils.saveCurrentSessionId( + currentSession.id, + currentSession.agentId + ); + // cache session data + SessionUtils.saveSessionData(currentSession.agentId, currentSession); + // only update selectedModel if the session has a valid model and it's different from current + if ( + currentSession.selectedModel && + currentSession.selectedModel !== selectedModel + ) { + setSelectedModel(currentSession.selectedModel); + } + } + }, [currentSession, selectedModel]); + useEffect(() => { const fetchModels = async () => { try { @@ -38,7 +559,7 @@ export default function ChatPlaygroundPage() { const llmModels = modelList.filter(model => model.model_type === "llm"); setModels(llmModels); if (llmModels.length > 0) { - setSelectedModel(llmModels[0].identifier); + handleModelChange(llmModels[0].identifier); } } catch (err) { console.error("Error fetching models:", err); @@ -49,39 +570,27 @@ export default function ChatPlaygroundPage() { }; fetchModels(); - }, [client]); + }, [client, handleModelChange]); - const extractTextContent = (content: unknown): string => { - if (typeof content === "string") { - return content; - } - if (Array.isArray(content)) { - return content - .filter( - item => - item && - typeof item === "object" && - "type" in item && - item.type === "text" - ) - .map(item => - item && typeof item === "object" && "text" in item - ? String(item.text) - : "" - ) - .join(""); - } + // load agent sessions after both agents and models are ready + useEffect(() => { if ( - content && - typeof content === "object" && - "type" in content && - content.type === "text" && - "text" in content + selectedAgentId && + !agentsLoading && + !modelsLoading && + selectedModel && + !currentSession ) { - return String(content.text) || ""; + loadAgentSessions(selectedAgentId); } - return ""; - }; + }, [ + selectedAgentId, + agentsLoading, + modelsLoading, + selectedModel, + currentSession, + loadAgentSessions, + ]); const handleInputChange = (e: React.ChangeEvent) => { setInput(e.target.value); @@ -91,7 +600,6 @@ export default function ChatPlaygroundPage() { event?.preventDefault?.(); if (!input.trim()) return; - // Add user message to chat const userMessage: Message = { id: Date.now().toString(), role: "user", @@ -99,40 +607,54 @@ export default function ChatPlaygroundPage() { createdAt: new Date(), }; - setMessages(prev => [...prev, userMessage]); + setCurrentSession(prev => { + if (!prev) return prev; + const updatedSession = { + ...prev, + messages: [...prev.messages, userMessage], + updatedAt: Date.now(), + }; + // Update cache with new message + SessionUtils.saveSessionData(prev.agentId, updatedSession); + return updatedSession; + }); setInput(""); - // Use the helper function with the content await handleSubmitWithContent(userMessage.content); }; const handleSubmitWithContent = async (content: string) => { + if (!currentSession || !selectedAgentId) return; + setIsGenerating(true); setError(null); - try { - const messageParams: CompletionCreateParams["messages"] = [ - ...messages.map(msg => { - const msgContent = - typeof msg.content === "string" - ? 
msg.content - : extractTextContent(msg.content); - if (msg.role === "user") { - return { role: "user" as const, content: msgContent }; - } else if (msg.role === "assistant") { - return { role: "assistant" as const, content: msgContent }; - } else { - return { role: "system" as const, content: msgContent }; - } - }), - { role: "user" as const, content }, - ]; + if (abortControllerRef.current) { + abortControllerRef.current.abort(); + } - const response = await client.chat.completions.create({ - model: selectedModel, - messages: messageParams, + const abortController = new AbortController(); + abortControllerRef.current = abortController; + + try { + const userMessage = { + role: "user" as const, + content, + }; + + const turnParams: TurnCreateParams = { + messages: [userMessage], stream: true, - }); + }; + + const response = await client.agents.turn.create( + selectedAgentId, + currentSession.id, + turnParams, + { + signal: abortController.signal, + } as { signal: AbortSignal } + ); const assistantMessage: Message = { id: (Date.now() + 1).toString(), @@ -141,31 +663,112 @@ createdAt: new Date(), }; - setMessages(prev => [...prev, assistantMessage]); + const extractDeltaText = (chunk: unknown): string | null => { + // the stream can emit several different chunk shapes, so probe each known format in turn + if (chunk?.delta?.text && typeof chunk.delta.text === "string") { + return chunk.delta.text; + } + + if ( + chunk?.event?.delta?.text && + typeof chunk.event.delta.text === "string" + ) { + return chunk.event.delta.text; + } + + if ( + chunk?.choices?.[0]?.delta?.content && + typeof chunk.choices[0].delta.content === "string" + ) { + return chunk.choices[0].delta.content; + } + + if (typeof chunk === "string") { + return chunk; + } + + if ( + chunk?.event?.payload?.delta?.text && + typeof chunk.event.payload.delta.text === "string" + ) { + return chunk.event.payload.delta.text; + } + + if (process.env.NODE_ENV !== "production") { + console.debug("Unrecognized chunk format:", chunk); + } + + return null; + }; + setCurrentSession(prev => { + if (!prev) return null; + const updatedSession = { + ...prev, + messages: [...prev.messages, assistantMessage], + updatedAt: Date.now(), + }; + // update cache with assistant message + SessionUtils.saveSessionData(prev.agentId, updatedSession); + return updatedSession; + }); + let fullContent = ""; for await (const chunk of response) { - if (chunk.choices && chunk.choices[0]?.delta?.content) { - const deltaContent = chunk.choices[0].delta.content; - fullContent += deltaContent; + const deltaText = extractDeltaText(chunk); + + if (deltaText) { + fullContent += deltaText; flushSync(() => { - setMessages(prev => { - const newMessages = [...prev]; - const lastMessage = newMessages[newMessages.length - 1]; - if (lastMessage.role === "assistant") { - lastMessage.content = fullContent; + setCurrentSession(prev => { + if (!prev) return null; + const newMessages = [...prev.messages]; + const last = newMessages[newMessages.length - 1]; + if (last.role === "assistant") { + last.content = fullContent; } - return newMessages; + const updatedSession = { + ...prev, + messages: newMessages, + updatedAt: Date.now(), + }; + // update cache with streaming content (throttled) + if (fullContent.length % 100 === 0) { + // Only cache roughly every 100 characters to avoid excessive writes + SessionUtils.saveSessionData(prev.agentId, updatedSession); + } + return updatedSession; }); }); } } } catch (err) { + if (err instanceof Error &&
err.name === "AbortError") { + console.log("Request aborted"); + return; + } + console.error("Error sending message:", err); setError("Failed to send message. Please try again."); - setMessages(prev => prev.slice(0, -1)); + setCurrentSession(prev => + prev + ? { + ...prev, + messages: prev.messages.slice(0, -1), + updatedAt: Date.now(), + } + : prev + ); } finally { setIsGenerating(false); + abortControllerRef.current = null; + // cache final session state after streaming completes + setCurrentSession(prev => { + if (prev) { + SessionUtils.saveSessionData(prev.agentId, prev); + } + return prev; + }); } }; const suggestions = [ @@ -181,69 +784,457 @@ export default function ChatPlaygroundPage() { content: message.content, createdAt: new Date(), }; - setMessages(prev => [...prev, newMessage]); + setCurrentSession(prev => + prev + ? { + ...prev, + messages: [...prev.messages, newMessage], + updatedAt: Date.now(), + } + : prev + ); handleSubmitWithContent(newMessage.content); }; const clearChat = () => { - setMessages([]); + if (abortControllerRef.current) { + abortControllerRef.current.abort(); + abortControllerRef.current = null; + setIsGenerating(false); + } + + setCurrentSession(prev => + prev ? { ...prev, messages: [], updatedAt: Date.now() } : prev + ); setError(null); }; return ( -
-
-

Chat Playground (Completions)

-
- - +
+ {/* Header */} +
+
+

Agent Session

+
+ {!agentsLoading && agents.length > 0 && ( +
+ + + {selectedAgentId && agents.length > 1 && ( + + )} +
+ )} + + {!agentsLoading && agents.length > 0 && ( + + )} +
+
+
+ {/* Main Two-Column Layout */} +
+ {/* Left Column - Configuration Panel */} +
+

+ Settings +

+ + {/* Model Configuration */} +
+

+ Model Configuration +

+
+
+ + + {modelsError && ( +

{modelsError}

+ )} +
+ +
+ +
+ {(selectedAgentId && + agents.find(a => a.agent_id === selectedAgentId) + ?.agent_config?.instructions) || + "No agent selected"} +
+

+ Instructions are set when creating an agent and cannot be + changed. +

+
+
+
+ + {/* Agent Tools */} +
+

+ Agent Tools +

+
+
+ +
+ {selectedAgentConfig?.toolgroups && + selectedAgentConfig.toolgroups.length > 0 ? ( + selectedAgentConfig.toolgroups.map( + ( + toolgroup: + | string + | { name: string; args: Record }, + index: number + ) => { + const toolName = + typeof toolgroup === "string" + ? toolgroup + : toolgroup.name; + const toolArgs = + typeof toolgroup === "object" ? toolgroup.args : null; + + return ( +
+
+ + {toolName} + + + {toolName.includes("rag") + ? "🔍 RAG" + : toolName.includes("search") + ? "🌐 Search" + : "🔧 Tool"} + +
+ {toolArgs && Object.keys(toolArgs).length > 0 && ( +
+ Args:{" "} + {Object.entries(toolArgs) + .map( + ([key, value]) => + `${key}: ${JSON.stringify(value)}` + ) + .join(", ")} +
+ )} +
+ ); + } + ) + ) : ( +
+

+ No tools configured +

+

+ This agent only has text generation capabilities +

+
+ )} +
+

+ Tools are configured when creating an agent and provide + additional capabilities like web search, math calculations, or + RAG document retrieval. +

+
+
+
+
+ + {/* Right Column - Chat Interface */} +
+ {error && ( +
+

{error}

+
+ )} + + + setCurrentSession(prev => + prev ? { ...prev, messages, updatedAt: Date.now() } : prev + ) + } + />
- {modelsError && ( -
-

{modelsError}

+ {/* Create Agent Modal */} + {showCreateAgent && ( +
+ +

Create New Agent

+ +
+
+ + setNewAgentName(e.target.value)} + placeholder="My Custom Agent" + /> +
+ +
+ + +
+ +
+ +