diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 54c01c80d..5884f2582 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,4 +2,4 @@ # These owners will be the default owners for everything in # the repo. Unless a later match takes precedence, -* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722 @leseb +* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index af2058b9a..263828e1c 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,10 +1,8 @@ # What does this PR do? -[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] + -[//]: # (If resolving an issue, uncomment and update the line below) -[//]: # (Closes #[issue-number]) + + ## Test Plan -[Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] - -[//]: # (## Documentation) + diff --git a/.github/TRIAGERS.md b/.github/TRIAGERS.md index d4ef6d1ac..586a5a506 100644 --- a/.github/TRIAGERS.md +++ b/.github/TRIAGERS.md @@ -1,2 +1,2 @@ # This file documents Triage members in the Llama Stack community -@franciscojavierarceo @leseb + @bbrowning @booxter @franciscojavierarceo @leseb diff --git a/.github/actions/setup-ollama/action.yml b/.github/actions/setup-ollama/action.yml new file mode 100644 index 000000000..3dd6c940c --- /dev/null +++ b/.github/actions/setup-ollama/action.yml @@ -0,0 +1,26 @@ +name: Setup Ollama +description: Start Ollama and cache model +inputs: + models: + description: Comma-separated list of models to pull + default: "llama3.2:3b-instruct-fp16,all-minilm:latest" +runs: + using: "composite" + steps: + - name: Install and start Ollama + shell: bash + run: | + # the ollama installer also starts the ollama service + curl -fsSL https://ollama.com/install.sh | sh + + # Do NOT cache models - pulling the cache is actually slower than just pulling the model. + # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to + # pull them directly. + # Maybe this is because the cache is being pulled at the same time by all the matrix jobs? + - name: Pull requested models + if: inputs.models != '' + shell: bash + run: | + for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do + ollama pull "$model" + done diff --git a/.github/actions/setup-runner/action.yml b/.github/actions/setup-runner/action.yml new file mode 100644 index 000000000..6cba4fdc3 --- /dev/null +++ b/.github/actions/setup-runner/action.yml @@ -0,0 +1,22 @@ +name: Setup runner +description: Prepare a runner for the tests (install uv, python, project dependencies, etc.) 
+runs: + using: "composite" + steps: + - name: Install uv + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 + with: + python-version: "3.10" + activate-environment: true + version: 0.7.6 + + - name: Install dependencies + shell: bash + run: | + uv sync --all-groups + uv pip install ollama faiss-cpu + # always test against the latest version of the client + # TODO: this is not necessarily a good idea. we need to test against both published and latest + # to find out backwards compatibility issues. + uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main + uv pip install -e . diff --git a/.github/workflows/Dockerfile b/.github/workflows/Dockerfile new file mode 100644 index 000000000..9261bd174 --- /dev/null +++ b/.github/workflows/Dockerfile @@ -0,0 +1 @@ +FROM localhost:5000/distribution-kvant:dev \ No newline at end of file diff --git a/.github/workflows/ci-playground.yaml b/.github/workflows/ci-playground.yaml new file mode 100644 index 000000000..251782855 --- /dev/null +++ b/.github/workflows/ci-playground.yaml @@ -0,0 +1,73 @@ +name: Build and Push playground container +run-name: Build and Push playground container +on: + workflow_dispatch: + #schedule: + # - cron: "0 10 * * *" + push: + branches: + - main + - kvant + tags: + - 'v*' + pull_request: + branches: + - main + - kvant +env: + IMAGE: git.kvant.cloud/${{github.repository}}-playground +jobs: + build-playground: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set current time + uses: https://github.com/gerred/actions/current-time@master + id: current_time + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to git.kvant.cloud registry + uses: docker/login-action@v3 + with: + registry: git.kvant.cloud + username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }} + password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }} + + - name: Docker meta + id: meta + uses: 
docker/metadata-action@v5 + with: + # list of Docker images to use as base name for tags + images: | + ${{env.IMAGE}} + # generate Docker tags based on the following events/attributes + tags: | + type=schedule + type=ref,event=branch + type=ref,event=pr + type=ref,event=tag + type=semver,pattern={{version}} + + - name: Build and push to gitea registry + uses: docker/build-push-action@v6 + with: + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + context: . + file: llama_stack/distribution/ui/Containerfile + provenance: mode=max + sbom: true + build-args: | + BUILD_DATE=${{ steps.current_time.outputs.time }} + cache-from: | + type=registry,ref=${{ env.IMAGE }}:buildcache + type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }} + type=registry,ref=${{ env.IMAGE }}:main + cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 000000000..87f196cc2 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,98 @@ +name: Build and Push container +run-name: Build and Push container +on: + workflow_dispatch: + #schedule: + # - cron: "0 10 * * *" + push: + branches: + - main + - kvant + tags: + - 'v*' + pull_request: + branches: + - main + - kvant +env: + IMAGE: git.kvant.cloud/${{github.repository}} +jobs: + build: + runs-on: ubuntu-latest + services: + registry: + image: registry:2 + ports: + - 5000:5000 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set current time + uses: https://github.com/gerred/actions/current-time@master + id: current_time + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host + + - name: Login to git.kvant.cloud registry + uses: docker/login-action@v3 + with: + registry: git.kvant.cloud + username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }} + 
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }} + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + # list of Docker images to use as base name for tags + images: | + ${{env.IMAGE}} + # generate Docker tags based on the following events/attributes + tags: | + type=schedule + type=ref,event=branch + type=ref,event=pr + type=ref,event=tag + type=semver,pattern={{version}} + + - name: Install uv + uses: https://github.com/astral-sh/setup-uv@v5 + with: + # Install a specific version of uv. + version: "0.7.8" + + - name: Build + env: + USE_COPY_NOT_MOUNT: true + LLAMA_STACK_DIR: . + run: | + uvx --from . llama stack build --template kvant --image-type container + + # docker tag distribution-kvant:dev ${{env.IMAGE}}:kvant + # docker push ${{env.IMAGE}}:kvant + + docker tag distribution-kvant:dev localhost:5000/distribution-kvant:dev + docker push localhost:5000/distribution-kvant:dev + + - name: Build and push to gitea registry + uses: docker/build-push-action@v6 + with: + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + context: .github/workflows + provenance: mode=max + sbom: true + build-args: | + BUILD_DATE=${{ steps.current_time.outputs.time }} + cache-from: | + type=registry,ref=${{ env.IMAGE }}:buildcache + type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }} + type=registry,ref=${{ env.IMAGE }}:main + cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true diff --git a/.github/workflows/changelog.yml b/.github/workflows_upstream/changelog.yml similarity index 100% rename from .github/workflows/changelog.yml rename to .github/workflows_upstream/changelog.yml diff --git a/.github/workflows/gha_workflow_llama_stack_tests.yml b/.github/workflows_upstream/gha_workflow_llama_stack_tests.yml similarity index 100% rename from .github/workflows/gha_workflow_llama_stack_tests.yml rename to 
.github/workflows_upstream/gha_workflow_llama_stack_tests.yml diff --git a/.github/workflows_upstream/install-script-ci.yml b/.github/workflows_upstream/install-script-ci.yml new file mode 100644 index 000000000..2eb234c77 --- /dev/null +++ b/.github/workflows_upstream/install-script-ci.yml @@ -0,0 +1,26 @@ +name: Installer CI + +on: + pull_request: + paths: + - 'install.sh' + push: + paths: + - 'install.sh' + schedule: + - cron: '0 2 * * *' # every day at 02:00 UTC + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 + - name: Run ShellCheck on install.sh + run: shellcheck install.sh + smoke-test: + needs: lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 + - name: Run installer end-to-end + run: ./install.sh diff --git a/.github/workflows_upstream/integration-auth-tests.yml b/.github/workflows_upstream/integration-auth-tests.yml new file mode 100644 index 000000000..a3a746246 --- /dev/null +++ b/.github/workflows_upstream/integration-auth-tests.yml @@ -0,0 +1,132 @@ +name: Integration Auth Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + paths: + - 'distributions/**' + - 'llama_stack/**' + - 'tests/integration/**' + - 'uv.lock' + - 'pyproject.toml' + - 'requirements.txt' + - '.github/workflows/integration-auth-tests.yml' # This workflow + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test-matrix: + runs-on: ubuntu-latest + strategy: + matrix: + auth-provider: [oauth2_token] + fail-fast: false # we want to run all tests regardless of failure + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Install dependencies + uses: ./.github/actions/setup-runner + + - name: Build Llama Stack + run: | + llama stack build --template ollama --image-type venv + + - name: Install 
minikube + if: ${{ matrix.auth-provider == 'kubernetes' }} + uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19 + + - name: Start minikube + if: ${{ matrix.auth-provider == 'oauth2_token' }} + run: | + minikube start + kubectl get pods -A + + - name: Configure Kube Auth + if: ${{ matrix.auth-provider == 'oauth2_token' }} + run: | + kubectl create namespace llama-stack + kubectl create serviceaccount llama-stack-auth -n llama-stack + kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack + kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token + cat <> $GITHUB_ENV + echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV + echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV + echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. 
-f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV + + - name: Set Kube Auth Config and run server + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + if: ${{ matrix.auth-provider == 'oauth2_token' }} + run: | + run_dir=$(mktemp -d) + cat <<'EOF' > $run_dir/run.yaml + version: '2' + image_name: kube + apis: [] + providers: {} + server: + port: 8321 + EOF + yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml + yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml + yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml + cat $run_dir/run.yaml + + nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 & + + - name: Wait for Llama Stack server to be ready + run: | + echo "Waiting for Llama Stack server..." + for i in {1..30}; do + if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then + echo "Llama Stack server is up!" 
+ if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then + echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth" + exit 0 + else + echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth" + cat server.log + exit 1 + fi + fi + sleep 1 + done + echo "Llama Stack server failed to start" + cat server.log + exit 1 + + - name: Test auth + run: | + curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq diff --git a/.github/workflows/integration-tests.yml b/.github/workflows_upstream/integration-tests.yml similarity index 67% rename from .github/workflows/integration-tests.yml rename to .github/workflows_upstream/integration-tests.yml index f54bed839..d78e82c9d 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows_upstream/integration-tests.yml @@ -24,7 +24,7 @@ jobs: matrix: # Listing tests manually since some of them currently fail # TODO: generate matrix list from tests/integration when fixed - test-type: [agents, inference, datasets, inspect, scoring, post_training, providers] + test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime] client-type: [library, http] fail-fast: false # we want to run all tests regardless of failure @@ -32,30 +32,14 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Install uv - uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1 - with: - python-version: "3.10" + - name: Install dependencies + uses: ./.github/actions/setup-runner - - name: Install and start Ollama - run: | - # the ollama installer also starts the ollama service - curl -fsSL https://ollama.com/install.sh | sh + - name: Setup ollama + uses: ./.github/actions/setup-ollama - - name: Pull Ollama image + - name: Build Llama Stack run: | - # TODO: cache the model. 
OLLAMA_MODELS defaults to ~ollama/.ollama/models. - ollama pull llama3.2:3b-instruct-fp16 - - - name: Set Up Environment and Install Dependencies - run: | - uv sync --extra dev --extra test - uv pip install ollama faiss-cpu - # always test against the latest version of the client - # TODO: this is not necessarily a good idea. we need to test against both published and latest - # to find out backwards compatibility issues. - uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main - uv pip install -e . llama stack build --template ollama --image-type venv - name: Start Llama Stack server in background @@ -63,8 +47,7 @@ jobs: env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" run: | - source .venv/bin/activate - nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 & + LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv & - name: Wait for Llama Stack server to be ready if: matrix.client-type == 'http' @@ -92,6 +75,12 @@ jobs: exit 1 fi + - name: Check Storage and Memory Available Before Tests + if: ${{ always() }} + run: | + free -h + df -h + - name: Run Integration Tests env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" @@ -101,7 +90,27 @@ jobs: else stack_config="http://localhost:8321" fi - uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ + uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ --text-model="meta-llama/Llama-3.2-3B-Instruct" \ --embedding-model=all-MiniLM-L6-v2 + + - name: Check Storage and Memory Available After Tests + if: ${{ always() }} + run: | + free -h + df -h + + - name: Write ollama logs to file + if: ${{ always() }} + run: | + sudo journalctl -u ollama.service > ollama.log + + - name: Upload all logs to artifacts + if: ${{ 
always() }} + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }} + path: | + *.log + retention-days: 1 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows_upstream/pre-commit.yml similarity index 87% rename from .github/workflows/pre-commit.yml rename to .github/workflows_upstream/pre-commit.yml index 17a42dd26..2bbd52c53 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows_upstream/pre-commit.yml @@ -18,7 +18,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: '3.11' cache: pip @@ -27,6 +27,9 @@ jobs: .pre-commit-config.yaml - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + env: + SKIP: no-commit-to-branch + RUFF_OUTPUT_FORMAT: github - name: Verify if there are any diff files after pre-commit run: | diff --git a/.github/workflows/providers-build.yml b/.github/workflows_upstream/providers-build.yml similarity index 65% rename from .github/workflows/providers-build.yml rename to .github/workflows_upstream/providers-build.yml index 23257d7dc..cf53459b9 100644 --- a/.github/workflows/providers-build.yml +++ b/.github/workflows_upstream/providers-build.yml @@ -50,21 +50,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1 - with: - python-version: "3.10" - - - name: Install LlamaStack - run: | - uv venv - source .venv/bin/activate - uv pip install 
-e . + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Print build dependencies run: | @@ -79,7 +66,6 @@ jobs: - name: Print dependencies in the image if: matrix.image-type == 'venv' run: | - source test/bin/activate uv pip list build-single-provider: @@ -88,21 +74,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1 - with: - python-version: "3.10" - - - name: Install LlamaStack - run: | - uv venv - source .venv/bin/activate - uv pip install -e . + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Build a single provider run: | @@ -114,27 +87,14 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: '3.10' - - - name: Install uv - uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1 - with: - python-version: "3.10" - - - name: Install LlamaStack - run: | - uv venv - source .venv/bin/activate - uv pip install -e . + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Build a single provider run: | - yq -i '.image_type = "container"' llama_stack/templates/dev/build.yaml - yq -i '.image_name = "test"' llama_stack/templates/dev/build.yaml - USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/dev/build.yaml + yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml + yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml + USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. 
uv run llama stack build --config llama_stack/templates/starter/build.yaml - name: Inspect the container image entrypoint run: | @@ -145,3 +105,43 @@ jobs: echo "Entrypoint is not correct" exit 1 fi + + build-ubi9-container-distribution: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Install dependencies + uses: ./.github/actions/setup-runner + + - name: Pin template to UBI9 base + run: | + yq -i ' + .image_type = "container" | + .image_name = "ubi9-test" | + .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest" + ' llama_stack/templates/starter/build.yaml + + - name: Build dev container (UBI9) + env: + USE_COPY_NOT_MOUNT: "true" + LLAMA_STACK_DIR: "." + run: | + uv run llama stack build --config llama_stack/templates/starter/build.yaml + + - name: Inspect UBI9 image + run: | + IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1) + entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID) + echo "Entrypoint: $entrypoint" + if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then + echo "Entrypoint is not correct" + exit 1 + fi + + echo "Checking /etc/os-release in $IMAGE_ID" + docker run --rm --entrypoint sh "$IMAGE_ID" -c \ + 'source /etc/os-release && echo "$ID"' \ + | grep -qE '^(rhel|ubi)$' \ + || { echo "Base image is not UBI 9!"; exit 1; } diff --git a/.github/workflows/semantic-pr.yml b/.github/workflows_upstream/semantic-pr.yml similarity index 100% rename from .github/workflows/semantic-pr.yml rename to .github/workflows_upstream/semantic-pr.yml diff --git a/.github/workflows/stale_bot.yml b/.github/workflows_upstream/stale_bot.yml similarity index 100% rename from .github/workflows/stale_bot.yml rename to .github/workflows_upstream/stale_bot.yml diff --git a/.github/workflows/test-external-providers.yml 
b/.github/workflows_upstream/test-external-providers.yml similarity index 51% rename from .github/workflows/test-external-providers.yml rename to .github/workflows_upstream/test-external-providers.yml index 37f5c45ab..06ab7cf3c 100644 --- a/.github/workflows/test-external-providers.yml +++ b/.github/workflows_upstream/test-external-providers.yml @@ -23,29 +23,10 @@ jobs: # container and point 'uv pip install' to the correct path... steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Install uv - uses: astral-sh/setup-uv@v5 - with: - python-version: "3.10" - - - name: Install Ollama - run: | - curl -fsSL https://ollama.com/install.sh | sh - - - name: Pull Ollama image - run: | - ollama pull llama3.2:3b-instruct-fp16 - - - name: Start Ollama in background - run: | - nohup ollama run llama3.2:3b-instruct-fp16 --keepalive=30m > ollama.log 2>&1 & - - - name: Set Up Environment and Install Dependencies - run: | - uv sync --extra dev --extra test - uv pip install -e . + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Apply image type to config file run: | @@ -59,57 +40,32 @@ jobs: - name: Create provider configuration run: | - mkdir -p /tmp/providers.d/remote/inference - cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /tmp/providers.d/remote/inference/custom_ollama.yaml + mkdir -p /home/runner/.llama/providers.d/remote/inference + cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml - name: Build distro from config file run: | USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml - - name: Wait for Ollama to start - run: | - echo "Waiting for Ollama..." 
- for i in {1..30}; do - if curl -s http://localhost:11434 | grep -q "Ollama is running"; then - echo "Ollama is running!" - exit 0 - fi - sleep 1 - done - echo "Ollama failed to start" - ollama ps - ollama.log - exit 1 - - name: Start Llama Stack server in background if: ${{ matrix.image-type }} == 'venv' env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" run: | - source ci-test/bin/activate uv run pip list nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 & - name: Wait for Llama Stack server to be ready run: | - echo "Waiting for Llama Stack server..." for i in {1..30}; do - if curl -s http://localhost:8321/v1/health | grep -q "OK"; then - echo "Llama Stack server is up!" - if grep -q "remote::custom_ollama from /tmp/providers.d/remote/inference/custom_ollama.yaml" server.log; then - echo "Llama Stack server is using custom Ollama provider" - exit 0 - else - echo "Llama Stack server is not using custom Ollama provider" - exit 1 - fi + if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then + echo "Waiting for Llama Stack server to load the provider..." 
+ sleep 1 + else + echo "Provider loaded" + exit 0 fi - sleep 1 done - echo "Llama Stack server failed to start" + echo "Provider failed to load" cat server.log exit 1 - - - name: run inference tests - run: | - uv run pytest -v tests/integration/inference/test_text_inference.py --stack-config="http://localhost:8321" --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2 diff --git a/.github/workflows/tests.yml b/.github/workflows_upstream/tests.yml similarity index 100% rename from .github/workflows/tests.yml rename to .github/workflows_upstream/tests.yml diff --git a/.github/workflows/unit-tests.yml b/.github/workflows_upstream/unit-tests.yml similarity index 73% rename from .github/workflows/unit-tests.yml rename to .github/workflows_upstream/unit-tests.yml index 962141744..fc0459f0f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows_upstream/unit-tests.yml @@ -30,17 +30,11 @@ jobs: - "3.12" - "3.13" steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: ${{ matrix.python }} - - - uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1 - with: - python-version: ${{ matrix.python }} - enable-cache: false + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Run unit tests run: | diff --git a/.github/workflows/update-readthedocs.yml b/.github/workflows_upstream/update-readthedocs.yml similarity index 78% rename from .github/workflows/update-readthedocs.yml rename to .github/workflows_upstream/update-readthedocs.yml index 794a727be..981332a77 100644 --- a/.github/workflows/update-readthedocs.yml +++ b/.github/workflows_upstream/update-readthedocs.yml @@ -14,6 +14,8 @@ on: - 'docs/**' - 
'pyproject.toml' - '.github/workflows/update-readthedocs.yml' + tags: + - '*' pull_request: branches: - main @@ -35,16 +37,8 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 - with: - python-version: '3.11' - - - name: Install the latest version of uv - uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1 - - - name: Sync with uv - run: uv sync --extra docs + - name: Install dependencies + uses: ./.github/actions/setup-runner - name: Build HTML run: | @@ -61,7 +55,10 @@ jobs: response=$(curl -X POST \ -H "Content-Type: application/json" \ - -d "{\"token\": \"$TOKEN\"}" \ + -d "{ + \"token\": \"$TOKEN\", + \"version\": \"$GITHUB_REF_NAME\" + }" \ https://readthedocs.org/api/v2/webhook/llama-stack/289768/) echo "Response: $response" diff --git a/.gitignore b/.gitignore index 0ef25cdf1..747acdc7b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ dev_requirements.txt build .DS_Store llama_stack/configs/* +.cursor/ xcuserdata/ *.hmap .DS_Store @@ -23,3 +24,4 @@ venv/ pytest-report.xml .coverage .python-version +data diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ff3bc1250..aaec469e4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,18 @@ repos: args: ['--maxkb=1000'] - id: end-of-file-fixer exclude: '^(.*\.svg)$' + - id: no-commit-to-branch + - id: check-yaml + args: ["--unsafe"] + - id: detect-private-key + - id: requirements-txt-fixer + - id: mixed-line-ending + args: [--fix=lf] # Forces to replace line ending by LF (line feed) + - id: check-executables-have-shebangs + - id: check-json + - id: check-shebang-scripts-are-executable + - id: check-symlinks + - id: check-toml - repo: https://github.com/Lucas-C/pre-commit-hooks rev: v1.5.4 @@ -41,7 +53,7 @@ repos: - black==24.3.0 - repo: https://github.com/astral-sh/uv-pre-commit - 
rev: 0.6.3 + rev: 0.7.8 hooks: - id: uv-lock - id: uv-export @@ -49,6 +61,7 @@ repos: "--frozen", "--no-hashes", "--no-emit-project", + "--no-default-groups", "--output-file=requirements.txt" ] @@ -76,24 +89,29 @@ repos: - id: distro-codegen name: Distribution Template Codegen additional_dependencies: - - uv==0.6.0 - entry: uv run --extra codegen ./scripts/distro_codegen.py + - uv==0.7.8 + entry: uv run --group codegen ./scripts/distro_codegen.py language: python pass_filenames: false require_serial: true files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$ - -- repo: local - hooks: - id: openapi-codegen name: API Spec Codegen additional_dependencies: - - uv==0.6.2 - entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null' + - uv==0.7.8 + entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null' language: python pass_filenames: false require_serial: true files: ^llama_stack/apis/|^docs/openapi_generator/ + - id: check-workflows-use-hashes + name: Check GitHub Actions use SHA-pinned actions + entry: ./scripts/check-workflows-use-hashes.sh + language: system + pass_filenames: false + require_serial: true + always_run: true + files: ^\.github/workflows/.*\.ya?ml$ ci: autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks diff --git a/.readthedocs.yaml b/.readthedocs.yaml index f114dbf9b..461977a6c 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,28 +5,21 @@ # Required version: 2 +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/source/conf.py + # Set the OS, Python version and other tools you might need build: os: ubuntu-22.04 tools: python: "3.12" - # You can also specify other tool versions: - # nodejs: "19" - # rust: "1.64" - # golang: "1.19" - -# Build documentation in the "docs/" directory with Sphinx -sphinx: - configuration: docs/source/conf.py - -# Optionally build your docs in additional 
formats such as PDF and ePub -# formats: -# - pdf -# - epub - -# Optional but recommended, declare the Python requirements required -# to build your documentation -# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html -python: - install: - - requirements: docs/requirements.txt + jobs: + pre_create_environment: + - asdf plugin add uv + - asdf install uv latest + - asdf global uv latest + create_environment: + - uv venv "${READTHEDOCS_VIRTUALENV_PATH}" + install: + - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs diff --git a/CHANGELOG.md b/CHANGELOG.md index 5086094ad..f7644a5af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,75 @@ # Changelog +# v0.2.7 +Published on: 2025-05-16T20:38:10Z + +## Highlights + +This is a small update. But a couple highlights: + +* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece. +* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134 +* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases. + + +--- + +# v0.2.6 +Published on: 2025-05-12T18:06:52Z + + + +--- + +# v0.2.5 +Published on: 2025-05-04T20:16:49Z + + + +--- + +# v0.2.4 +Published on: 2025-04-29T17:26:01Z + +## Highlights + +* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383 +* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852 +* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778 +* (yuge!) 
OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989 +* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058 + + +--- + +# v0.2.3 +Published on: 2025-04-25T22:46:21Z + +## Highlights + +* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works. +* significant improvements and functionality added to the nVIDIA distribution +* many improvements to the test verification suite. +* new inference providers: Ramalama, IBM WatsonX +* many improvements to the Playground UI + + +--- + +# v0.2.2 +Published on: 2025-04-13T01:19:49Z + +## Main changes + +- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server +- OpenAI compatible inference API in progress (@bbrowning) +- Provider verifications (@ehhuang) +- Many updates and fixes to playground +- Several llama4 related fixes + + +--- + # v0.2.1 Published on: 2025-04-05T23:13:00Z @@ -10,10 +80,10 @@ Published on: 2025-04-05T23:13:00Z # v0.2.0 Published on: 2025-04-05T19:04:29Z -## Llama 4 Support - -Checkout more at https://www.llama.com - +## Llama 4 Support + +Checkout more at https://www.llama.com + --- @@ -21,58 +91,58 @@ Checkout more at https://www.llama.com # v0.1.9 Published on: 2025-03-29T00:52:23Z -### Build and Test Agents -* Agents: Entire document context with attachments -* RAG: Documentation with sqlite-vec faiss comparison -* Getting started: Fixes to getting started notebook. - -### Agent Evals and Model Customization -* (**New**) Post-training: Add nemo customizer - -### Better Engineering -* Moved sqlite-vec to non-blocking calls -* Don't return a payload on file delete - - +### Build and Test Agents +* Agents: Entire document context with attachments +* RAG: Documentation with sqlite-vec faiss comparison +* Getting started: Fixes to getting started notebook. 
+ +### Agent Evals and Model Customization +* (**New**) Post-training: Add nemo customizer + +### Better Engineering +* Moved sqlite-vec to non-blocking calls +* Don't return a payload on file delete + + --- # v0.1.8 Published on: 2025-03-24T01:28:50Z -# v0.1.8 Release Notes - -### Build and Test Agents -* Safety: Integrated NVIDIA as a safety provider. -* VectorDB: Added Qdrant as an inline provider. -* Agents: Added support for multiple tool groups in agents. -* Agents: Simplified imports for Agents in client package - - -### Agent Evals and Model Customization -* Introduced DocVQA and IfEval benchmarks. - -### Deploying and Monitoring Agents -* Introduced a Containerfile and image workflow for the Playground. -* Implemented support for Bearer (API Key) authentication. -* Added attribute-based access control for resources. -* Fixes on docker deployments: use --pull always and standardized the default port to 8321 -* Deprecated: /v1/inspect/providers use /v1/providers/ instead - -### Better Engineering -* Consolidated scripts under the ./scripts directory. -* Addressed mypy violations in various modules. -* Added Dependabot scans for Python dependencies. -* Implemented a scheduled workflow to update the changelog automatically. -* Enforced concurrency to reduce CI loads. - - -### New Contributors -* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650 -* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671 -* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698 -* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745 - +# v0.1.8 Release Notes + +### Build and Test Agents +* Safety: Integrated NVIDIA as a safety provider. +* VectorDB: Added Qdrant as an inline provider. +* Agents: Added support for multiple tool groups in agents. 
+* Agents: Simplified imports for Agents in client package + + +### Agent Evals and Model Customization +* Introduced DocVQA and IfEval benchmarks. + +### Deploying and Monitoring Agents +* Introduced a Containerfile and image workflow for the Playground. +* Implemented support for Bearer (API Key) authentication. +* Added attribute-based access control for resources. +* Fixes on docker deployments: use --pull always and standardized the default port to 8321 +* Deprecated: /v1/inspect/providers use /v1/providers/ instead + +### Better Engineering +* Consolidated scripts under the ./scripts directory. +* Addressed mypy violations in various modules. +* Added Dependabot scans for Python dependencies. +* Implemented a scheduled workflow to update the changelog automatically. +* Enforced concurrency to reduce CI loads. + + +### New Contributors +* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650 +* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671 +* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698 +* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745 + **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8 --- @@ -80,73 +150,73 @@ Published on: 2025-03-24T01:28:50Z # v0.1.7 Published on: 2025-03-14T22:30:51Z -## 0.1.7 Release Notes - -### Build and Test Agents -* Inference: ImageType is now refactored to LlamaStackImageType -* Inference: Added tests to measure TTFT -* Inference: Bring back usage metrics -* Agents: Added endpoint for get agent, list agents and list sessions -* Agents: Automated conversion of type hints in client tool for lite llm format -* Agents: Deprecated ToolResponseMessage in agent.resume API -* Added Provider API for listing and inspecting provider info - -### Agent Evals and Model Customization -* Eval: Added new eval benchmarks 
Math 500 and BFCL v3 -* Deploy and Monitoring of Agents -* Telemetry: Fix tracing to work across coroutines - -### Better Engineering -* Display code coverage for unit tests -* Updated call sites (inference, tool calls, agents) to move to async non blocking calls -* Unit tests also run on Python 3.11, 3.12, and 3.13 -* Added ollama inference to Integration tests CI -* Improved documentation across examples, testing, CLI, updated providers table ) - - - +## 0.1.7 Release Notes + +### Build and Test Agents +* Inference: ImageType is now refactored to LlamaStackImageType +* Inference: Added tests to measure TTFT +* Inference: Bring back usage metrics +* Agents: Added endpoint for get agent, list agents and list sessions +* Agents: Automated conversion of type hints in client tool for lite llm format +* Agents: Deprecated ToolResponseMessage in agent.resume API +* Added Provider API for listing and inspecting provider info + +### Agent Evals and Model Customization +* Eval: Added new eval benchmarks Math 500 and BFCL v3 +* Deploy and Monitoring of Agents +* Telemetry: Fix tracing to work across coroutines + +### Better Engineering +* Display code coverage for unit tests +* Updated call sites (inference, tool calls, agents) to move to async non blocking calls +* Unit tests also run on Python 3.11, 3.12, and 3.13 +* Added ollama inference to Integration tests CI +* Improved documentation across examples, testing, CLI, updated providers table ) + + + --- # v0.1.6 Published on: 2025-03-08T04:35:08Z -## 0.1.6 Release Notes - -### Build and Test Agents -* Inference: Fixed support for inline vllm provider -* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb) -* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples -* 
Agent: Unify tools and Python SDK Agents API -* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls -* Agent: Support python functions without @client_tool decorator as client tools -* Agent: deprecation for allow_resume_turn flag, and remove need to specify tool_prompt_format -* VectorIO: MilvusDB support added - -### Agent Evals and Model Customization -* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb) -* Eval: Documentation for eval, scoring, adding new benchmarks -* Eval: Distribution template to run benchmarks on llama & non-llama models -* Eval: Ability to register new custom LLM-as-judge scoring functions -* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details. - -### Deploy and Monitoring of Agents -* Better support for different log levels across all components for better monitoring - -### Better Engineering -* Enhance OpenAPI spec to include Error types across all APIs -* Moved all tests to /tests and created unit tests to run on each PR -* Removed all dependencies on llama-models repo - +## 0.1.6 Release Notes + +### Build and Test Agents +* Inference: Fixed support for inline vllm provider +* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb) +* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples +* Agent: Unify tools and Python SDK Agents API +* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls +* Agent: Support python functions without @client_tool decorator as client tools +* Agent: deprecation for 
allow_resume_turn flag, and remove need to specify tool_prompt_format +* VectorIO: MilvusDB support added + +### Agent Evals and Model Customization +* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb) +* Eval: Documentation for eval, scoring, adding new benchmarks +* Eval: Distribution template to run benchmarks on llama & non-llama models +* Eval: Ability to register new custom LLM-as-judge scoring functions +* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details. + +### Deploy and Monitoring of Agents +* Better support for different log levels across all components for better monitoring + +### Better Engineering +* Enhance OpenAPI spec to include Error types across all APIs +* Moved all tests to /tests and created unit tests to run on each PR +* Removed all dependencies on llama-models repo + --- # v0.1.5.1 Published on: 2025-02-28T22:37:44Z -## 0.1.5.1 Release Notes -* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328 - +## 0.1.5.1 Release Notes +* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328 + **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1 --- @@ -154,176 +224,176 @@ Published on: 2025-02-28T22:37:44Z # v0.1.5 Published on: 2025-02-28T18:14:01Z -## 0.1.5 Release Notes -### Build Agents -* Inference: Support more non-llama models (openai, anthropic, gemini) -* Inference: Can use the provider's model name in addition to the HF alias -* Inference: Fixed issues with calling tools that weren't specified in the prompt -* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool 
calling -* Embeddings: Added support for Nemo retriever embedding models -* Tools: Added support for MCP tools in Ollama Distribution -* Distributions: Added new Groq distribution - -### Customize Models -* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model - -### Monitor agents -* More comprehensive logging of agent steps including client tools -* Telemetry inputs/outputs are now structured and queryable -* Ability to retrieve agents session, turn, step by ids - -### Better Engineering -* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin -* Move most logging to use logger instead of prints -* Completed text /chat-completion and /completion tests - +## 0.1.5 Release Notes +### Build Agents +* Inference: Support more non-llama models (openai, anthropic, gemini) +* Inference: Can use the provider's model name in addition to the HF alias +* Inference: Fixed issues with calling tools that weren't specified in the prompt +* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling +* Embeddings: Added support for Nemo retriever embedding models +* Tools: Added support for MCP tools in Ollama Distribution +* Distributions: Added new Groq distribution + +### Customize Models +* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model + +### Monitor agents +* More comprehensive logging of agent steps including client tools +* Telemetry inputs/outputs are now structured and queryable +* Ability to retrieve agents session, turn, step by ids + +### Better Engineering +* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin +* Move most logging to use logger instead of prints +* Completed text /chat-completion and /completion tests + --- # v0.1.4 Published on: 2025-02-25T00:02:43Z -## v0.1.4 Release Notes -Here are the key changes 
coming as part of this release: - -### Build and Test Agents -* Inference: Added support for non-llama models -* Inference: Added option to list all downloaded models and remove models -* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn -* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides -* Agent: Added logging for agent step start and completion times -* Agent: Added support for logging for tool execution metadata -* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs -* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults -* VectorIO: Improved performance of sqlite-vec using chunked writes -### Agent Evals and Model Customization -* Deprecated api /eval-tasks. Use /eval/benchmark instead -* Added CPU training support for TorchTune -### Deploy and Monitoring of Agents -* Consistent view of client and server tool calls in telemetry -### Better Engineering -* Made tests more data-driven for consistent evaluation -* Fixed documentation links and improved API reference generation -* Various small fixes for build scripts and system reliability - - +## v0.1.4 Release Notes +Here are the key changes coming as part of this release: + +### Build and Test Agents +* Inference: Added support for non-llama models +* Inference: Added option to list all downloaded models and remove models +* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn +* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides +* Agent: Added logging for agent step start and completion times +* Agent: Added support for logging for tool execution metadata +* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs 
+* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults +* VectorIO: Improved performance of sqlite-vec using chunked writes +### Agent Evals and Model Customization +* Deprecated api /eval-tasks. Use /eval/benchmark instead +* Added CPU training support for TorchTune +### Deploy and Monitoring of Agents +* Consistent view of client and server tool calls in telemetry +### Better Engineering +* Made tests more data-driven for consistent evaluation +* Fixed documentation links and improved API reference generation +* Various small fixes for build scripts and system reliability + + --- # v0.1.3 Published on: 2025-02-14T20:24:32Z -## v0.1.3 Release - -Here are some key changes that are coming as part of this release. - -### Build and Test Agents -Streamlined the initial development experience -- Added support for llama stack run --image-type venv -- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration -- vLLM improvements for tool calling and logprobs -- Better handling of sporadic code_interpreter tool calls - -### Agent Evals -Better benchmarking and Agent performance assessment -- Renamed eval API /eval-task to /benchmarks -- Improved documentation and notebooks for RAG and evals - -### Deploy and Monitoring of Agents -Improved production readiness -- Added usage metrics collection for chat completions -- CLI improvements for provider information -- Improved error handling and system reliability -- Better model endpoint handling and accessibility -- Improved signal handling on distro server - -### Better Engineering -Infrastructure and code quality improvements -- Faster text-based chat completion tests -- Improved testing for non-streaming agent apis -- Standardized import formatting with ruff linter -- Added conventional commits standard -- Fixed documentation parsing issues - +## v0.1.3 Release + +Here are some key changes that are coming as part of this release. 
+ +### Build and Test Agents +Streamlined the initial development experience +- Added support for llama stack run --image-type venv +- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration +- vLLM improvements for tool calling and logprobs +- Better handling of sporadic code_interpreter tool calls + +### Agent Evals +Better benchmarking and Agent performance assessment +- Renamed eval API /eval-task to /benchmarks +- Improved documentation and notebooks for RAG and evals + +### Deploy and Monitoring of Agents +Improved production readiness +- Added usage metrics collection for chat completions +- CLI improvements for provider information +- Improved error handling and system reliability +- Better model endpoint handling and accessibility +- Improved signal handling on distro server + +### Better Engineering +Infrastructure and code quality improvements +- Faster text-based chat completion tests +- Improved testing for non-streaming agent apis +- Standardized import formatting with ruff linter +- Added conventional commits standard +- Fixed documentation parsing issues + --- # v0.1.2 Published on: 2025-02-07T22:06:49Z -# TL;DR -- Several stabilizations to development flows after the switch to `uv` -- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops) -- Added automated rebuilds for ReadTheDocs -- Llama Stack server supports HTTPS -- Added system prompt overrides support -- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan ) - +# TL;DR +- Several stabilizations to development flows after the switch to `uv` +- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops) +- Added automated rebuilds for ReadTheDocs +- Llama Stack server supports HTTPS +- Added system prompt overrides support +- Several bug fixes and improvements to documentation (check out Kubernetes deployment 
guide by @terrytangyuan ) + --- # v0.1.1 Published on: 2025-02-02T02:29:24Z -A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements. - +A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements. + --- # v0.1.0 Published on: 2025-01-24T17:47:47Z -We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor and those agents with telemetry, and evaluate the agent with scoring functions. - -## Context -GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open. - -Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety. - -With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. 
You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience. - -## Release -After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements. - -There are example standalone apps in llama-stack-apps. - - -## Key Features of this release - -- **Unified API Layer** - - Inference: Run LLM models - - RAG: Store and retrieve knowledge for RAG - - Agents: Build multi-step agentic workflows - - Tools: Register tools that can be called by the agent - - Safety: Apply content filtering and safety policies - - Evaluation: Test model and agent quality - - Telemetry: Collect and analyze usage data and complex agentic traces - - Post Training ( Coming Soon ): Fine tune models for specific use cases - -- **Rich Provider Ecosystem** - - Local Development: Meta's Reference, Ollama - - Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras - - On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI - - On-device: iOS and Android support - -- **Built for Production** - - Pre-packaged distributions for common deployment scenarios - - Backwards compatibility across model versions - - Comprehensive evaluation capabilities - - Full observability and monitoring - -- **Multiple developer interfaces** - - CLI: Command line interface - - Python SDK - - 
Swift iOS SDK - - Kotlin Android SDK - -- **Sample llama stack applications** - - Python - - iOS - - Android - - +We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor and those agents with telemetry, and evaluate the agent with scoring functions. + +## Context +GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open. + +Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety. + +With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. 
And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience. + +## Release +After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements. + +There are example standalone apps in llama-stack-apps. + + +## Key Features of this release + +- **Unified API Layer** + - Inference: Run LLM models + - RAG: Store and retrieve knowledge for RAG + - Agents: Build multi-step agentic workflows + - Tools: Register tools that can be called by the agent + - Safety: Apply content filtering and safety policies + - Evaluation: Test model and agent quality + - Telemetry: Collect and analyze usage data and complex agentic traces + - Post Training ( Coming Soon ): Fine tune models for specific use cases + +- **Rich Provider Ecosystem** + - Local Development: Meta's Reference, Ollama + - Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras + - On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI + - On-device: iOS and Android support + +- **Built for Production** + - Pre-packaged distributions for common deployment scenarios + - Backwards compatibility across model versions + - Comprehensive evaluation capabilities + - Full observability and monitoring + +- **Multiple developer interfaces** + - CLI: Command line interface + - Python SDK + - Swift iOS SDK + - Kotlin Android SDK + +- **Sample llama stack applications** + - Python + - 
iOS + - Android + + --- @@ -337,8 +407,8 @@ Published on: 2025-01-22T22:24:01Z # v0.0.63 Published on: 2024-12-18T07:17:43Z -A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially. - +A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially. + **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63 --- @@ -374,39 +444,39 @@ Published on: 2024-11-22T00:36:09Z # v0.0.53 Published on: 2024-11-20T22:18:00Z -🚀 Initial Release Notes for Llama Stack! - -### Added -- Resource-oriented design for models, shields, memory banks, datasets and eval tasks -- Persistence for registered objects with distribution -- Ability to persist memory banks created for FAISS -- PostgreSQL KVStore implementation -- Environment variable placeholder support in run.yaml files -- Comprehensive Zero-to-Hero notebooks and quickstart guides -- Support for quantized models in Ollama -- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM -- Bedrock distribution with safety shields support -- Evals API with task registration and scoring functions -- MMLU and SimpleQA benchmark scoring functions -- Huggingface dataset provider integration for benchmarks -- Support for custom dataset registration from local paths -- Benchmark evaluation CLI tools with visualization tables -- RAG evaluation scoring functions and metrics -- Local persistence for datasets and eval tasks - -### Changed -- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner) -- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`) -- Updated API signatures for dataset and eval task registration -- Restructured folder organization for providers -- Enhanced Docker build configuration -- Added version prefixing for REST API routes -- Enhanced evaluation task 
registration workflow -- Improved benchmark evaluation output formatting -- Restructured evals folder organization for better modularity - -### Removed -- `llama stack configure` command - +🚀 Initial Release Notes for Llama Stack! + +### Added +- Resource-oriented design for models, shields, memory banks, datasets and eval tasks +- Persistence for registered objects with distribution +- Ability to persist memory banks created for FAISS +- PostgreSQL KVStore implementation +- Environment variable placeholder support in run.yaml files +- Comprehensive Zero-to-Hero notebooks and quickstart guides +- Support for quantized models in Ollama +- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM +- Bedrock distribution with safety shields support +- Evals API with task registration and scoring functions +- MMLU and SimpleQA benchmark scoring functions +- Huggingface dataset provider integration for benchmarks +- Support for custom dataset registration from local paths +- Benchmark evaluation CLI tools with visualization tables +- RAG evaluation scoring functions and metrics +- Local persistence for datasets and eval tasks + +### Changed +- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner) +- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`) +- Updated API signatures for dataset and eval task registration +- Restructured folder organization for providers +- Enhanced Docker build configuration +- Added version prefixing for REST API routes +- Enhanced evaluation task registration workflow +- Improved benchmark evaluation output formatting +- Restructured evals folder organization for better modularity + +### Removed +- `llama stack configure` command + --- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5828250d0..10e3f6cee 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -110,25 +110,9 @@ uv run pre-commit run --all-files > [!CAUTION] > Before pushing your changes, make sure 
that the pre-commit hooks have passed successfully. -## Running unit tests +## Running tests -You can run the unit tests by running: - -```bash -source .venv/bin/activate -./scripts/unit-tests.sh -``` - -If you'd like to run for a non-default version of Python (currently 3.10), pass `PYTHON_VERSION` variable as follows: - -``` -source .venv/bin/activate -PYTHON_VERSION=3.13 ./scripts/unit-tests.sh -``` - -## Running integration tests - -You can run integration tests following the instructions [here](tests/integration/README.md). +You can find the Llama Stack testing documentation here [here](tests/README.md). ## Adding a new dependency to the project @@ -141,11 +125,20 @@ uv sync ## Coding Style -* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter, same goes for docstrings. -* Prefer comments to clarify surprising behavior and/or relationships between parts of the code rather than explain what the next line of code does. -* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`. +* Comments should provide meaningful insights into the code. Avoid filler comments that simply + describe the next step, as they create unnecessary clutter, same goes for docstrings. +* Prefer comments to clarify surprising behavior and/or relationships between parts of the code + rather than explain what the next line of code does. +* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like + `Exception`. * Error messages should be prefixed with "Failed to ..." -* 4 spaces for indentation rather than tabs +* 4 spaces for indentation rather than tab +* When using `# noqa` to suppress a style or linter warning, include a comment explaining the + justification for bypassing the check. 
+* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the + justification for bypassing the check. +* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or + readability reasons. ## Common Tasks @@ -174,14 +167,11 @@ If you have made changes to a provider's configuration in any form (introducing If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme. ```bash -cd docs -uv sync --extra docs - # This rebuilds the documentation pages. -uv run make html +uv run --group docs make -C docs/ html # This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation. -uv run sphinx-autobuild source build/html --write-all +uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all ``` ### Update API Documentation @@ -189,7 +179,7 @@ uv run sphinx-autobuild source build/html --write-all If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command: ```bash -uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh +uv run ./docs/openapi_generator/run_openapi_generator.sh ``` The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing. 
diff --git a/MANIFEST.in b/MANIFEST.in index 879a9cbd4..88bd11767 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ include pyproject.toml -include llama_stack/templates/dependencies.json include llama_stack/models/llama/llama3/tokenizer.model include llama_stack/models/llama/llama4/tokenizer.model include llama_stack/distribution/*.sh diff --git a/README.md b/README.md index 9a4f1a849..37f1aa0f3 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain) [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain) -[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) +[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack) ### ✨🎉 Llama 4 Support 🎉✨ We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta. 
@@ -70,6 +70,13 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as w +### 🚀 One-Line Installer 🚀 + +To try Llama Stack locally, run: + +```bash +curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh +``` ### Overview @@ -100,26 +107,29 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on ### API Providers Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack. -| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | -|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:| -| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | -| SambaNova | Hosted | | ✅ | | | | -| Cerebras | Hosted | | ✅ | | | | -| Fireworks | Hosted | ✅ | ✅ | ✅ | | | -| AWS Bedrock | Hosted | | ✅ | | ✅ | | -| Together | Hosted | ✅ | ✅ | | ✅ | | -| Groq | Hosted | | ✅ | | | | -| Ollama | Single Node | | ✅ | | | | -| TGI | Hosted and Single Node | | ✅ | | | | -| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | -| Chroma | Single Node | | | ✅ | | | -| PG Vector | Single Node | | | ✅ | | | -| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | -| vLLM | Hosted and Single Node | | ✅ | | | | -| OpenAI | Hosted | | ✅ | | | | -| Anthropic | Hosted | | ✅ | | | | -| Gemini | Hosted | | ✅ | | | | -| watsonx | Hosted | | ✅ | | | | +| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** | +|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:| +| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | | +| SambaNova | Hosted | | ✅ | | ✅ | | | +| Cerebras | Hosted | | ✅ | | | | | +| Fireworks | Hosted | ✅ | ✅ | ✅ | | | | +| AWS Bedrock | Hosted | | ✅ | | ✅ | 
| | +| Together | Hosted | ✅ | ✅ | | ✅ | | | +| Groq | Hosted | | ✅ | | | | | +| Ollama | Single Node | | ✅ | | | | | +| TGI | Hosted and Single Node | | ✅ | | | | | +| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | | +| Chroma | Single Node | | | ✅ | | | | +| PG Vector | Single Node | | | ✅ | | | | +| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | +| vLLM | Hosted and Single Node | | ✅ | | | | | +| OpenAI | Hosted | | ✅ | | | | | +| Anthropic | Hosted | | ✅ | | | | | +| Gemini | Hosted | | ✅ | | | | | +| watsonx | Hosted | | ✅ | | | | | +| HuggingFace | Single Node | | | | | | ✅ | +| TorchTune | Single Node | | | | | | ✅ | +| NVIDIA NEMO | Hosted | | | | | | ✅ | ### Distributions diff --git a/docs/_static/css/my_theme.css b/docs/_static/css/my_theme.css index a587f866d..d078ec057 100644 --- a/docs/_static/css/my_theme.css +++ b/docs/_static/css/my_theme.css @@ -27,3 +27,9 @@ pre { white-space: pre-wrap !important; word-break: break-all; } + +[data-theme="dark"] .mermaid { + background-color: #f4f4f6 !important; + border-radius: 6px; + padding: 0.5em; + } diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 4c5393947..d88462909 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -62,11 +62,12 @@ "tags": [ "DatasetIO" ], - "description": "", + "description": "Append rows to a dataset.", "parameters": [ { "name": "dataset_id", "in": "path", + "description": "The ID of the dataset to append the rows to.", "required": true, "schema": { "type": "string" @@ -89,7 +90,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A BatchChatCompletionResponse with the full completions.", "content": { "application/json": { "schema": { @@ -114,7 +115,7 @@ "tags": [ "Inference" ], - "description": "", + "description": "Generate chat completions for a batch of messages using the specified model.", "parameters": [], "requestBody": { "content": { @@ -132,7 +133,7 @@ "post": { 
"responses": { "200": { - "description": "OK", + "description": "A BatchCompletionResponse with the full completions.", "content": { "application/json": { "schema": { @@ -157,7 +158,7 @@ "tags": [ "Inference" ], - "description": "", + "description": "Generate completions for a batch of content using the specified model.", "parameters": [], "requestBody": { "content": { @@ -193,7 +194,7 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Cancel a training job.", "parameters": [], "requestBody": { "content": { @@ -211,7 +212,7 @@ "post": { "responses": { "200": { - "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk", + "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.", "content": { "application/json": { "schema": { @@ -259,7 +260,7 @@ "post": { "responses": { "200": { - "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk", + "description": "If stream=False, returns a CompletionResponse with the full completion. 
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.", "content": { "application/json": { "schema": { @@ -307,11 +308,11 @@ "get": { "responses": { "200": { - "description": "A ListAgentsResponse.", + "description": "A PaginatedResponse.", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListAgentsResponse" + "$ref": "#/components/schemas/PaginatedResponse" } } } @@ -333,7 +334,26 @@ "Agents" ], "description": "List all agents.", - "parameters": [] + "parameters": [ + { + "name": "start_index", + "in": "query", + "description": "The index to start the pagination from.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "limit", + "in": "query", + "description": "The number of agents to return.", + "required": false, + "schema": { + "type": "integer" + } + } + ] }, "post": { "responses": { @@ -434,7 +454,7 @@ "post": { "responses": { "200": { - "description": "If stream=False, returns a Turn object. If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk", + "description": "If stream=False, returns a Turn object. 
If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk.", "content": { "application/json": { "schema": { @@ -497,11 +517,127 @@ } } }, + "/v1/openai/v1/responses": { + "get": { + "responses": { + "200": { + "description": "A ListOpenAIResponseObject.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListOpenAIResponseObject" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "List all OpenAI responses.", + "parameters": [ + { + "name": "after", + "in": "query", + "description": "The ID of the last response to return.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "limit", + "in": "query", + "description": "The number of responses to return.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "model", + "in": "query", + "description": "The model to filter responses by.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "order", + "in": "query", + "description": "The order to sort responses by when sorted by created_at ('asc' or 'desc').", + "required": false, + "schema": { + "$ref": "#/components/schemas/Order" + } + } + ] + }, + "post": { + "responses": { + "200": { + "description": "An OpenAIResponseObject.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIResponseObject" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/OpenAIResponseObjectStream" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref":
"#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "Create a new OpenAI response.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateOpenaiResponseRequest" + } + } + }, + "required": true + } + } + }, "/v1/files": { "get": { "responses": { "200": { - "description": "OK", + "description": "A ListBucketResponse.", "content": { "application/json": { "schema": { @@ -531,6 +667,7 @@ { "name": "bucket", "in": "query", + "description": "Bucket name (valid chars: a-zA-Z0-9_-).", "required": true, "schema": { "type": "string" @@ -541,7 +678,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A FileUploadResponse.", "content": { "application/json": { "schema": { @@ -643,7 +780,7 @@ "tags": [ "Agents" ], - "description": "Delete an agent by its ID.", + "description": "Delete an agent by its ID and its associated sessions and turns.", "parameters": [ { "name": "agent_id", @@ -661,7 +798,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Session.", "content": { "application/json": { "schema": { @@ -741,7 +878,7 @@ "tags": [ "Agents" ], - "description": "Delete an agent session by its ID.", + "description": "Delete an agent session by its ID and its associated turns.", "parameters": [ { "name": "session_id", @@ -768,7 +905,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A FileResponse.", "content": { "application/json": { "schema": { @@ -798,7 +935,7 @@ { "name": "bucket", "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-)", + "description": "Bucket name (valid chars: a-zA-Z0-9_-).", "required": true, "schema": { "type": "string" @@ -807,7 +944,7 @@ { "name": "key", "in": "path", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)", +
"description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).", "required": true, "schema": { "type": "string" @@ -841,7 +978,7 @@ { "name": "bucket", "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-)", + "description": "Bucket name (valid chars: a-zA-Z0-9_-).", "required": true, "schema": { "type": "string" @@ -850,7 +987,7 @@ { "name": "key", "in": "path", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)", + "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).", "required": true, "schema": { "type": "string" @@ -863,7 +1000,7 @@ "post": { "responses": { "200": { - "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}", + "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.", "content": { "application/json": { "schema": { @@ -906,7 +1043,7 @@ "post": { "responses": { "200": { - "description": "EvaluateResponse object containing generations and scores", + "description": "EvaluateResponse object containing generations and scores.", "content": { "application/json": { "schema": { @@ -1090,7 +1227,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Benchmark.", "content": { "application/json": { "schema": { @@ -1115,11 +1252,55 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "Get a benchmark by its ID.", "parameters": [ { "name": "benchmark_id", "in": "path", + "description": "The ID of the benchmark to get.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/openai/v1/chat/completions/{completion_id}": { + "get": { + "responses": { + "200": { + "description": "An
OpenAICompletionWithInputMessages.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAICompletionWithInputMessages" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "Describe a chat completion by its ID.", + "parameters": [ + { + "name": "completion_id", + "in": "path", + "description": "ID of the chat completion.", "required": true, "schema": { "type": "string" @@ -1132,7 +1313,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Dataset.", "content": { "application/json": { "schema": { @@ -1157,11 +1338,12 @@ "tags": [ "Datasets" ], - "description": "", + "description": "Get a dataset by its ID.", "parameters": [ { "name": "dataset_id", "in": "path", + "description": "The ID of the dataset to get.", "required": true, "schema": { "type": "string" @@ -1190,11 +1372,12 @@ "tags": [ "Datasets" ], - "description": "", + "description": "Unregister a dataset by its ID.", "parameters": [ { "name": "dataset_id", "in": "path", + "description": "The ID of the dataset to unregister.", "required": true, "schema": { "type": "string" @@ -1207,7 +1390,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Model.", "content": { "application/json": { "schema": { @@ -1232,11 +1415,12 @@ "tags": [ "Models" ], - "description": "", + "description": "Get a model by its identifier.", "parameters": [ { "name": "model_id", "in": "path", + "description": "The identifier of the model to get.", "required": true, "schema": { "type": "string" @@ -1265,11 +1449,55 @@ "tags": [ "Models" ], - "description": "", + "description": "Unregister a model.", "parameters": [ { "name": "model_id", "in": "path", + 
"description": "The identifier of the model to unregister.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/openai/v1/responses/{response_id}": { + "get": { + "responses": { + "200": { + "description": "An OpenAIResponseObject.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIResponseObject" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "Retrieve an OpenAI response by its ID.", + "parameters": [ + { + "name": "response_id", + "in": "path", + "description": "The ID of the OpenAI response to retrieve.", "required": true, "schema": { "type": "string" @@ -1282,7 +1510,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ScoringFn.", "content": { "application/json": { "schema": { @@ -1307,11 +1535,12 @@ "tags": [ "ScoringFunctions" ], - "description": "", + "description": "Get a scoring function by its ID.", "parameters": [ { "name": "scoring_fn_id", "in": "path", + "description": "The ID of the scoring function to get.", "required": true, "schema": { "type": "string" @@ -1324,7 +1553,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Shield.", "content": { "application/json": { "schema": { @@ -1349,11 +1578,12 @@ "tags": [ "Shields" ], - "description": "", + "description": "Get a shield by its identifier.", "parameters": [ { "name": "identifier", "in": "path", + "description": "The identifier of the shield to get.", "required": true, "schema": { "type": "string" @@ -1366,7 +1596,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Span.", "content": { "application/json": { "schema": { @@ -1391,11 +1621,12 @@ "tags": 
[ "Telemetry" ], - "description": "", + "description": "Get a span by its ID.", "parameters": [ { "name": "trace_id", "in": "path", + "description": "The ID of the trace to get the span from.", "required": true, "schema": { "type": "string" @@ -1404,6 +1635,7 @@ { "name": "span_id", "in": "path", + "description": "The ID of the span to get.", "required": true, "schema": { "type": "string" @@ -1416,7 +1648,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A QuerySpanTreeResponse.", "content": { "application/json": { "schema": { @@ -1441,11 +1673,12 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Get a span tree by its ID.", "parameters": [ { "name": "span_id", "in": "path", + "description": "The ID of the span to get the tree from.", "required": true, "schema": { "type": "string" @@ -1468,7 +1701,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Tool.", "content": { "application/json": { "schema": { @@ -1493,11 +1726,12 @@ "tags": [ "ToolGroups" ], - "description": "", + "description": "Get a tool by its name.", "parameters": [ { "name": "tool_name", "in": "path", + "description": "The name of the tool to get.", "required": true, "schema": { "type": "string" @@ -1510,7 +1744,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ToolGroup.", "content": { "application/json": { "schema": { @@ -1535,11 +1769,12 @@ "tags": [ "ToolGroups" ], - "description": "", + "description": "Get a tool group by its ID.", "parameters": [ { "name": "toolgroup_id", "in": "path", + "description": "The ID of the tool group to get.", "required": true, "schema": { "type": "string" @@ -1568,11 +1803,12 @@ "tags": [ "ToolGroups" ], - "description": "Unregister a tool group", + "description": "Unregister a tool group.", "parameters": [ { "name": "toolgroup_id", "in": "path", + "description": "The ID of the tool group to unregister.", "required": true, "schema": { "type": "string" @@ 
-1585,7 +1821,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Trace.", "content": { "application/json": { "schema": { @@ -1610,11 +1846,12 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Get a trace by its ID.", "parameters": [ { "name": "trace_id", "in": "path", + "description": "The ID of the trace to get.", "required": true, "schema": { "type": "string" @@ -1627,7 +1864,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A PostTrainingJobArtifactsResponse.", "content": { "application/json": { "schema": { @@ -1652,11 +1889,12 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Get the artifacts of a training job.", "parameters": [ { "name": "job_uuid", "in": "query", + "description": "The UUID of the job to get the artifacts of.", "required": true, "schema": { "type": "string" @@ -1669,7 +1907,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A PostTrainingJobStatusResponse.", "content": { "application/json": { "schema": { @@ -1694,11 +1932,12 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Get the status of a training job.", "parameters": [ { "name": "job_uuid", "in": "query", + "description": "The UUID of the job to get the status of.", "required": true, "schema": { "type": "string" @@ -1711,7 +1950,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ListPostTrainingJobsResponse.", "content": { "application/json": { "schema": { @@ -1736,7 +1975,7 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Get all training jobs.", "parameters": [] } }, @@ -1744,7 +1983,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A FileUploadResponse.", "content": { "application/json": { "schema": { @@ -1769,12 +2008,12 @@ "tags": [ "Files" ], - "description": "Returns information about an existsing upload session", +
"description": "Returns information about an existing upload session.", "parameters": [ { "name": "upload_id", "in": "path", - "description": "ID of the upload session", + "description": "ID of the upload session.", "required": true, "schema": { "type": "string" @@ -1785,7 +2024,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A FileResponse or None if the upload is not complete.", "content": { "application/json": { "schema": { @@ -1822,7 +2061,7 @@ { "name": "upload_id", "in": "path", - "description": "ID of the upload session", + "description": "ID of the upload session.", "required": true, "schema": { "type": "string" @@ -1846,7 +2085,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A VectorDB.", "content": { "application/json": { "schema": { @@ -1871,11 +2110,12 @@ "tags": [ "VectorDBs" ], - "description": "", + "description": "Get a vector database by its identifier.", "parameters": [ { "name": "vector_db_id", "in": "path", + "description": "The identifier of the vector database to get.", "required": true, "schema": { "type": "string" @@ -1904,11 +2144,12 @@ "tags": [ "VectorDBs" ], - "description": "", + "description": "Unregister a vector database.", "parameters": [ { "name": "vector_db_id", "in": "path", + "description": "The identifier of the vector database to unregister.", "required": true, "schema": { "type": "string" @@ -1921,7 +2162,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A HealthInfo.", "content": { "application/json": { "schema": { @@ -1946,7 +2187,7 @@ "tags": [ "Inspect" ], - "description": "", + "description": "Get the health of the service.", "parameters": [] } }, @@ -2008,7 +2249,7 @@ "tags": [ "VectorIO" ], - "description": "", + "description": "Insert chunks into a vector database.", "parameters": [], "requestBody": { "content": { @@ -2026,7 +2267,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ProviderInfo
object containing the provider's details.", "content": { "application/json": { "schema": { @@ -2051,11 +2292,12 @@ "tags": [ "Providers" ], - "description": "", + "description": "Get detailed information about a specific provider.", "parameters": [ { "name": "provider_id", "in": "path", + "description": "The ID of the provider to inspect.", "required": true, "schema": { "type": "string" @@ -2068,7 +2310,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A ToolInvocationResult.", "content": { "application/json": { "schema": { @@ -2093,7 +2335,7 @@ "tags": [ "ToolRuntime" ], - "description": "Run a tool with the given arguments", + "description": "Run a tool with the given arguments.", "parameters": [], "requestBody": { "content": { @@ -2111,7 +2353,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A PaginatedResponse.", "content": { "application/json": { "schema": { @@ -2136,7 +2378,7 @@ "tags": [ "DatasetIO" ], - "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page\n- has_more: Whether there are more items available after this set", + "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. 
If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page.\n- has_more: Whether there are more items available after this set.", "parameters": [ { "name": "dataset_id", @@ -2172,7 +2414,7 @@ "get": { "responses": { "200": { - "description": "The status of the evaluationjob.", + "description": "The status of the evaluation job.", "content": { "application/json": { "schema": { @@ -2319,11 +2561,11 @@ "get": { "responses": { "200": { - "description": "A ListAgentSessionsResponse.", + "description": "A PaginatedResponse.", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListAgentSessionsResponse" + "$ref": "#/components/schemas/PaginatedResponse" } } } @@ -2354,6 +2596,24 @@ "schema": { "type": "string" } + }, + { + "name": "start_index", + "in": "query", + "description": "The index to start the pagination from.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "limit", + "in": "query", + "description": "The number of sessions to return.", + "required": false, + "schema": { + "type": "integer" + } } ] } @@ -2362,7 +2622,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ListBenchmarksResponse.", "content": { "application/json": { "schema": { @@ -2387,7 +2647,7 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "List all benchmarks.", "parameters": [] }, "post": { @@ -2411,7 +2671,7 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "Register a benchmark.", "parameters": [], "requestBody": { "content": { @@ -2425,678 +2685,79 @@ } } }, - "/v1/datasets": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListDatasetsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": 
"#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Datasets" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Dataset" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Datasets" - ], - "description": "Register a new dataset.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterDatasetRequest" - } - } - }, - "required": true - } - } - }, - "/v1/files/{bucket}": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListFileResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "List all files in a bucket.", - "parameters": [ - { - "name": "bucket", - "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-)", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, - "/v1/models": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListModelsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": 
"#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Models" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Model" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Models" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterModelRequest" - } - } - }, - "required": true - } - } - }, - "/v1/providers": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListProvidersResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Providers" - ], - "description": "", - "parameters": [] - } - }, - "/v1/inspect/routes": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListRoutesResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": 
"#/components/responses/DefaultError" - } - }, - "tags": [ - "Inspect" - ], - "description": "", - "parameters": [] - } - }, - "/v1/tool-runtime/list-tools": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListToolDefsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ToolRuntime" - ], - "description": "", - "parameters": [ - { - "name": "tool_group_id", - "in": "query", - "required": false, - "schema": { - "type": "string" - } - }, - { - "name": "mcp_endpoint", - "in": "query", - "required": false, - "schema": { - "$ref": "#/components/schemas/URL" - } - } - ] - } - }, - "/v1/scoring-functions": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListScoringFunctionsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - 
"application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterScoringFunctionRequest" - } - } - }, - "required": true - } - } - }, - "/v1/shields": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListShieldsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Shields" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Shield" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Shields" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterShieldRequest" - } - } - }, - "required": true - } - } - }, - "/v1/toolgroups": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListToolGroupsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ToolGroups" - ], - "description": "List tool groups with optional provider", - "parameters": 
[] - }, - "post": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ToolGroups" - ], - "description": "Register a tool group", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterToolGroupRequest" - } - } - }, - "required": true - } - } - }, - "/v1/tools": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListToolsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ToolGroups" - ], - "description": "List tools with optional tool group", - "parameters": [ - { - "name": "toolgroup_id", - "in": "query", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/v1/vector-dbs": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListVectorDBsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "VectorDBs" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": 
{ - "schema": { - "$ref": "#/components/schemas/VectorDB" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "VectorDBs" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterVectorDbRequest" - } - } - }, - "required": true - } - } - }, - "/v1/telemetry/events": { - "post": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Telemetry" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/LogEventRequest" - } - } - }, - "required": true - } - } - }, "/v1/openai/v1/chat/completions": { + "get": { + "responses": { + "200": { + "description": "A ListOpenAIChatCompletionResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListOpenAIChatCompletionResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "List all chat completions.", + "parameters": [ + { + "name": "after", + "in": "query", + "description": "The ID of the last chat completion to return.", + "required": false, + "schema": 
{ + "type": "string" + } + }, + { + "name": "limit", + "in": "query", + "description": "The maximum number of chat completions to return.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "model", + "in": "query", + "description": "The model to filter by.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "order", + "in": "query", + "description": "The order to sort the chat completions by: \"asc\" or \"desc\". Defaults to \"desc\".", + "required": false, + "schema": { + "$ref": "#/components/schemas/Order" + } + } + ] + }, "post": { "responses": { "200": { - "description": "Response from an OpenAI-compatible chat completion request. **OR** Chunk from a streaming response to an OpenAI-compatible chat completion request.", + "description": "An OpenAIChatCompletion.", "content": { "application/json": { "schema": { @@ -3142,11 +2803,772 @@ } } }, + "/v1/datasets": { + "get": { + "responses": { + "200": { + "description": "A ListDatasetsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListDatasetsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Datasets" + ], + "description": "List all datasets.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "A Dataset.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Dataset" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + 
"Datasets" + ], + "description": "Register a new dataset.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterDatasetRequest" + } + } + }, + "required": true + } + } + }, + "/v1/files/{bucket}": { + "get": { + "responses": { + "200": { + "description": "A ListFileResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListFileResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Files" + ], + "description": "List all files in a bucket.", + "parameters": [ + { + "name": "bucket", + "in": "path", + "description": "Bucket name (valid chars: a-zA-Z0-9_-).", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/models": { + "get": { + "responses": { + "200": { + "description": "A ListModelsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListModelsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Models" + ], + "description": "List all models.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "A Model.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Model" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" 
+ }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Models" + ], + "description": "Register a model.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterModelRequest" + } + } + }, + "required": true + } + } + }, + "/v1/openai/v1/responses/{response_id}/input_items": { + "get": { + "responses": { + "200": { + "description": "A ListOpenAIResponseInputItem.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListOpenAIResponseInputItem" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "List input items for a given OpenAI response.", + "parameters": [ + { + "name": "response_id", + "in": "path", + "description": "The ID of the response to retrieve input items for.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "after", + "in": "query", + "description": "An item ID to list items after, used for pagination.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "before", + "in": "query", + "description": "An item ID to list items before, used for pagination.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "include", + "in": "query", + "description": "Additional fields to include in the response.", + "required": false, + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + }, + { + "name": "limit", + "in": "query", + "description": "A limit on the number of objects to be returned. 
Limit can range between 1 and 100, and the default is 20.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "order", + "in": "query", + "description": "The order to return the input items in. Default is desc.", + "required": false, + "schema": { + "$ref": "#/components/schemas/Order" + } + } + ] + } + }, + "/v1/providers": { + "get": { + "responses": { + "200": { + "description": "A ListProvidersResponse containing information about all providers.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListProvidersResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "List all available providers.", + "parameters": [] + } + }, + "/v1/inspect/routes": { + "get": { + "responses": { + "200": { + "description": "A ListRoutesResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListRoutesResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inspect" + ], + "description": "List all routes.", + "parameters": [] + } + }, + "/v1/tool-runtime/list-tools": { + "get": { + "responses": { + "200": { + "description": "A ListToolDefsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListToolDefsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + 
"$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ToolRuntime" + ], + "description": "List all tools in the runtime.", + "parameters": [ + { + "name": "tool_group_id", + "in": "query", + "description": "The ID of the tool group to list tools for.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "mcp_endpoint", + "in": "query", + "description": "The MCP endpoint to use for the tool group.", + "required": false, + "schema": { + "$ref": "#/components/schemas/URL" + } + } + ] + } + }, + "/v1/scoring-functions": { + "get": { + "responses": { + "200": { + "description": "A ListScoringFunctionsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListScoringFunctionsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ScoringFunctions" + ], + "description": "List all scoring functions.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ScoringFunctions" + ], + "description": "Register a scoring function.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterScoringFunctionRequest" + } + } + }, + "required": true + } + } + }, + "/v1/shields": { + "get": { + "responses": { + "200": { + "description": "A ListShieldsResponse.", + 
"content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListShieldsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Shields" + ], + "description": "List all shields.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "A Shield.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Shield" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Shields" + ], + "description": "Register a shield.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterShieldRequest" + } + } + }, + "required": true + } + } + }, + "/v1/toolgroups": { + "get": { + "responses": { + "200": { + "description": "A ListToolGroupsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListToolGroupsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ToolGroups" + ], + "description": "List tool groups with optional provider.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": 
"#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ToolGroups" + ], + "description": "Register a tool group.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterToolGroupRequest" + } + } + }, + "required": true + } + } + }, + "/v1/tools": { + "get": { + "responses": { + "200": { + "description": "A ListToolsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListToolsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ToolGroups" + ], + "description": "List tools with optional tool group.", + "parameters": [ + { + "name": "toolgroup_id", + "in": "query", + "description": "The ID of the tool group to list tools for.", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/vector-dbs": { + "get": { + "responses": { + "200": { + "description": "A ListVectorDBsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListVectorDBsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "VectorDBs" + ], + "description": "List all vector databases.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "A VectorDB.", + "content": { + "application/json": { + "schema": { + 
"$ref": "#/components/schemas/VectorDB" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "VectorDBs" + ], + "description": "Register a vector database.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterVectorDbRequest" + } + } + }, + "required": true + } + } + }, + "/v1/telemetry/events": { + "post": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Telemetry" + ], + "description": "Log an event.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/LogEventRequest" + } + } + }, + "required": true + } + } + }, "/v1/openai/v1/completions": { "post": { "responses": { "200": { - "description": "OK", + "description": "An OpenAICompletion.", "content": { "application/json": { "schema": { @@ -3185,11 +3607,54 @@ } } }, + "/v1/openai/v1/embeddings": { + "post": { + "responses": { + "200": { + "description": "An OpenAIEmbeddingsResponse containing the embeddings.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIEmbeddingsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + 
}, + "tags": [ + "Inference" + ], + "description": "Generate OpenAI-compatible embeddings for the given input using the specified model.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenaiEmbeddingsRequest" + } + } + }, + "required": true + } + } + }, "/v1/openai/v1/models": { "get": { "responses": { "200": { - "description": "OK", + "description": "An OpenAIListModelsResponse.", "content": { "application/json": { "schema": { @@ -3214,7 +3679,7 @@ "tags": [ "Models" ], - "description": "", + "description": "List models using the OpenAI API.", "parameters": [] } }, @@ -3222,7 +3687,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A PostTrainingJob.", "content": { "application/json": { "schema": { @@ -3247,7 +3712,7 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Run preference optimization of a model.", "parameters": [], "requestBody": { "content": { @@ -3308,7 +3773,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A QueryChunksResponse.", "content": { "application/json": { "schema": { @@ -3333,7 +3798,7 @@ "tags": [ "VectorIO" ], - "description": "", + "description": "Query chunks from a vector database.", "parameters": [], "requestBody": { "content": { @@ -3347,11 +3812,64 @@ } } }, + "/v1/telemetry/metrics/{metric_name}": { + "post": { + "responses": { + "200": { + "description": "A QueryMetricsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryMetricsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Telemetry" + ], + "description": "Query metrics.", + "parameters": [ +
{ + "name": "metric_name", + "in": "path", + "description": "The name of the metric to query.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryMetricsRequest" + } + } + }, + "required": true + } + } + }, "/v1/telemetry/spans": { "post": { "responses": { "200": { - "description": "OK", + "description": "A QuerySpansResponse.", "content": { "application/json": { "schema": { @@ -3376,7 +3894,7 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Query spans.", "parameters": [], "requestBody": { "content": { @@ -3394,7 +3912,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A QueryTracesResponse.", "content": { "application/json": { "schema": { @@ -3419,7 +3937,7 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Query traces.", "parameters": [], "requestBody": { "content": { @@ -3566,7 +4084,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A RunShieldResponse.", "content": { "application/json": { "schema": { @@ -3591,7 +4109,7 @@ "tags": [ "Safety" ], - "description": "", + "description": "Run a shield.", "parameters": [], "requestBody": { "content": { @@ -3627,7 +4145,7 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Save spans to a dataset.", "parameters": [], "requestBody": { "content": { @@ -3645,7 +4163,7 @@ "post": { "responses": { "200": { - "description": "ScoreResponse object containing rows and aggregated results", + "description": "A ScoreResponse object containing rows and aggregated results.", "content": { "application/json": { "schema": { @@ -3688,7 +4206,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A ScoreBatchResponse.", "content": { "application/json": { "schema": { @@ -3713,7 +4231,7 @@ "tags": [ "Scoring" ], - "description": "", + "description": "Score a batch of rows.", 
"parameters": [], "requestBody": { "content": { @@ -3731,7 +4249,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A PostTrainingJob.", "content": { "application/json": { "schema": { @@ -3756,7 +4274,7 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Run supervised fine-tuning of a model.", "parameters": [], "requestBody": { "content": { @@ -3817,7 +4335,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A VersionInfo.", "content": { "application/json": { "schema": { @@ -3842,7 +4360,7 @@ "tags": [ "Inspect" ], - "description": "", + "description": "Get the version of the service.", "parameters": [] } } @@ -3908,7 +4426,8 @@ } ] } - } + }, + "description": "The rows to append to the dataset." } }, "additionalProperties": false, @@ -3961,9 +4480,13 @@ "properties": { "type": { "type": "string", + "enum": [ + "json_schema", + "grammar" + ], + "description": "Must be \"grammar\" to identify this format type", "const": "grammar", - "default": "grammar", - "description": "Must be \"grammar\" to identify this format type" + "default": "grammar" }, "bnf": { "type": "object", @@ -4087,9 +4610,13 @@ "properties": { "type": { "type": "string", + "enum": [ + "json_schema", + "grammar" + ], + "description": "Must be \"json_schema\" to identify this format type", "const": "json_schema", - "default": "json_schema", - "description": "Must be \"json_schema\" to identify this format type" + "default": "json_schema" }, "json_schema": { "type": "object", @@ -4607,7 +5134,8 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." }, "messages_batch": { "type": "array", @@ -4616,22 +5144,27 @@ "items": { "$ref": "#/components/schemas/Message" } - } + }, + "description": "The messages to generate completions for." 
}, "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy." }, "tools": { "type": "array", "items": { "$ref": "#/components/schemas/ToolDefinition" - } + }, + "description": "(Optional) List of tool definitions available to the model." }, "tool_config": { - "$ref": "#/components/schemas/ToolConfig" + "$ref": "#/components/schemas/ToolConfig", + "description": "(Optional) Configuration for tool use." }, "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding." }, "logprobs": { "type": "object", @@ -4643,7 +5176,7 @@ } }, "additionalProperties": false, - "title": "LogProbConfig" + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, @@ -4746,19 +5279,23 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." }, "content_batch": { "type": "array", "items": { "$ref": "#/components/schemas/InterleavedContent" - } + }, + "description": "The content to generate completions for." }, "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy." }, "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding." 
}, "logprobs": { "type": "object", @@ -4770,7 +5307,7 @@ } }, "additionalProperties": false, - "title": "LogProbConfig" + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, @@ -4838,7 +5375,8 @@ "type": "object", "properties": { "job_uuid": { - "type": "string" + "type": "string", + "description": "The UUID of the job to cancel." } }, "additionalProperties": false, @@ -4859,18 +5397,18 @@ "items": { "$ref": "#/components/schemas/Message" }, - "description": "List of messages in the conversation" + "description": "List of messages in the conversation." }, "sampling_params": { "$ref": "#/components/schemas/SamplingParams", - "description": "Parameters to control the sampling strategy" + "description": "Parameters to control the sampling strategy." }, "tools": { "type": "array", "items": { "$ref": "#/components/schemas/ToolDefinition" }, - "description": "(Optional) List of tool definitions available to the model" + "description": "(Optional) List of tool definitions available to the model." }, "tool_choice": { "type": "string", @@ -5090,15 +5628,15 @@ }, "content": { "$ref": "#/components/schemas/InterleavedContent", - "description": "The content to generate a completion for" + "description": "The content to generate a completion for." }, "sampling_params": { "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy" + "description": "(Optional) Parameters to control the sampling strategy." }, "response_format": { "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding" + "description": "(Optional) Grammar specification for guided (structured) decoding." 
}, "stream": { "type": "boolean", @@ -5547,6 +6085,14 @@ }, "step_type": { "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ], + "title": "StepType", + "description": "Type of the step in an agent turn.", "const": "inference", "default": "inference" }, @@ -5588,6 +6134,14 @@ }, "step_type": { "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ], + "title": "StepType", + "description": "Type of the step in an agent turn.", "const": "memory_retrieval", "default": "memory_retrieval" }, @@ -5676,6 +6230,14 @@ }, "step_type": { "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ], + "title": "StepType", + "description": "Type of the step in an agent turn.", "const": "shield_call", "default": "shield_call" }, @@ -5716,6 +6278,14 @@ }, "step_type": { "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ], + "title": "StepType", + "description": "Type of the step in an agent turn.", "const": "tool_execution", "default": "tool_execution" }, @@ -5978,6 +6548,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "step_complete", "default": "step_complete" }, @@ -6035,6 +6614,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "step_progress", "default": "step_progress" }, @@ -6070,6 +6658,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": 
"AgentTurnResponseEventType", "const": "step_start", "default": "step_start" }, @@ -6140,6 +6737,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "turn_awaiting_input", "default": "turn_awaiting_input" }, @@ -6159,6 +6765,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "turn_complete", "default": "turn_complete" }, @@ -6178,6 +6793,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "turn_start", "default": "turn_start" }, @@ -6192,24 +6816,880 @@ ], "title": "AgentTurnResponseTurnStartPayload" }, + "OpenAIResponseInput": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput" + }, + { + "$ref": "#/components/schemas/OpenAIResponseMessage" + } + ] + }, + "OpenAIResponseInputFunctionToolCallOutput": { + "type": "object", + "properties": { + "call_id": { + "type": "string" + }, + "output": { + "type": "string" + }, + "type": { + "type": "string", + "const": "function_call_output", + "default": "function_call_output" + }, + "id": { + "type": "string" + }, + "status": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "call_id", + "output", + "type" + ], + "title": "OpenAIResponseInputFunctionToolCallOutput", + "description": "This represents the output of a function call that gets 
passed back to the model." + }, + "OpenAIResponseInputMessageContent": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentText" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText", + "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage" + } + } + }, + "OpenAIResponseInputMessageContentImage": { + "type": "object", + "properties": { + "detail": { + "oneOf": [ + { + "type": "string", + "const": "low" + }, + { + "type": "string", + "const": "high" + }, + { + "type": "string", + "const": "auto" + } + ], + "default": "auto" + }, + "type": { + "type": "string", + "const": "input_image", + "default": "input_image" + }, + "image_url": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "detail", + "type" + ], + "title": "OpenAIResponseInputMessageContentImage" + }, + "OpenAIResponseInputMessageContentText": { + "type": "object", + "properties": { + "text": { + "type": "string" + }, + "type": { + "type": "string", + "const": "input_text", + "default": "input_text" + } + }, + "additionalProperties": false, + "required": [ + "text", + "type" + ], + "title": "OpenAIResponseInputMessageContentText" + }, + "OpenAIResponseInputTool": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputToolWebSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolFileSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolFunction" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolMCP" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "web_search": "#/components/schemas/OpenAIResponseInputToolWebSearch", + "file_search": "#/components/schemas/OpenAIResponseInputToolFileSearch", + "function": "#/components/schemas/OpenAIResponseInputToolFunction", + 
"mcp": "#/components/schemas/OpenAIResponseInputToolMCP" + } + } + }, + "OpenAIResponseInputToolFileSearch": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "file_search", + "default": "file_search" + }, + "vector_store_id": { + "type": "array", + "items": { + "type": "string" + } + }, + "ranking_options": { + "type": "object", + "properties": { + "ranker": { + "type": "string" + }, + "score_threshold": { + "type": "number", + "default": 0.0 + } + }, + "additionalProperties": false, + "title": "FileSearchRankingOptions" + } + }, + "additionalProperties": false, + "required": [ + "type", + "vector_store_id" + ], + "title": "OpenAIResponseInputToolFileSearch" + }, + "OpenAIResponseInputToolFunction": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "function", + "default": "function" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "parameters": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "strict": { + "type": "boolean" + } + }, + "additionalProperties": false, + "required": [ + "type", + "name" + ], + "title": "OpenAIResponseInputToolFunction" + }, + "OpenAIResponseInputToolMCP": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "mcp", + "default": "mcp" + }, + "server_label": { + "type": "string" + }, + "server_url": { + "type": "string" + }, + "headers": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "require_approval": { + "oneOf": [ + { + "type": "string", + "const": "always" + }, + { + "type": "string", + "const": "never" + }, + { + 
"type": "object", + "properties": { + "always": { + "type": "array", + "items": { + "type": "string" + } + }, + "never": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "title": "ApprovalFilter" + } + ], + "default": "never" + }, + "allowed_tools": { + "oneOf": [ + { + "type": "array", + "items": { + "type": "string" + } + }, + { + "type": "object", + "properties": { + "tool_names": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "title": "AllowedToolsFilter" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "type", + "server_label", + "server_url", + "require_approval" + ], + "title": "OpenAIResponseInputToolMCP" + }, + "OpenAIResponseInputToolWebSearch": { + "type": "object", + "properties": { + "type": { + "oneOf": [ + { + "type": "string", + "const": "web_search" + }, + { + "type": "string", + "const": "web_search_preview_2025_03_11" + } + ], + "default": "web_search" + }, + "search_context_size": { + "type": "string", + "default": "medium" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIResponseInputToolWebSearch" + }, + "OpenAIResponseMessage": { + "type": "object", + "properties": { + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContent" + } + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageContent" + } + } + ] + }, + "role": { + "oneOf": [ + { + "type": "string", + "const": "system" + }, + { + "type": "string", + "const": "developer" + }, + { + "type": "string", + "const": "user" + }, + { + "type": "string", + "const": "assistant" + } + ] + }, + "type": { + "type": "string", + "const": "message", + "default": "message" + }, + "id": { + "type": "string" + }, + "status": { + "type": "string" + } + }, + "additionalProperties": false, + "required": 
[ + "content", + "role", + "type" + ], + "title": "OpenAIResponseMessage", + "description": "Corresponds to the various Message types in the Responses API. They are all under one type because the Responses API gives them all the same \"type\" value, and there is no way to tell them apart in certain scenarios." + }, + "OpenAIResponseOutputMessageContent": { + "type": "object", + "properties": { + "text": { + "type": "string" + }, + "type": { + "type": "string", + "const": "output_text", + "default": "output_text" + } + }, + "additionalProperties": false, + "required": [ + "text", + "type" + ], + "title": "OpenAIResponseOutputMessageContentOutputText" + }, + "OpenAIResponseOutputMessageFunctionToolCall": { + "type": "object", + "properties": { + "call_id": { + "type": "string" + }, + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + }, + "type": { + "type": "string", + "const": "function_call", + "default": "function_call" + }, + "id": { + "type": "string" + }, + "status": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "call_id", + "name", + "arguments", + "type" + ], + "title": "OpenAIResponseOutputMessageFunctionToolCall" + }, + "OpenAIResponseOutputMessageWebSearchToolCall": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "status": { + "type": "string" + }, + "type": { + "type": "string", + "const": "web_search_call", + "default": "web_search_call" + } + }, + "additionalProperties": false, + "required": [ + "id", + "status", + "type" + ], + "title": "OpenAIResponseOutputMessageWebSearchToolCall" + }, + "CreateOpenaiResponseRequest": { + "type": "object", + "properties": { + "input": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInput" + } + } + ], + "description": "Input message(s) to create the response." + }, + "model": { + "type": "string", + "description": "The underlying LLM used for completions." 
+ }, + "instructions": { + "type": "string" + }, + "previous_response_id": { + "type": "string", + "description": "(Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses." + }, + "store": { + "type": "boolean" + }, + "stream": { + "type": "boolean" + }, + "temperature": { + "type": "number" + }, + "tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInputTool" + } + } + }, + "additionalProperties": false, + "required": [ + "input", + "model" + ], + "title": "CreateOpenaiResponseRequest" + }, + "OpenAIResponseError": { + "type": "object", + "properties": { + "code": { + "type": "string" + }, + "message": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "code", + "message" + ], + "title": "OpenAIResponseError" + }, + "OpenAIResponseObject": { + "type": "object", + "properties": { + "created_at": { + "type": "integer" + }, + "error": { + "$ref": "#/components/schemas/OpenAIResponseError" + }, + "id": { + "type": "string" + }, + "model": { + "type": "string" + }, + "object": { + "type": "string", + "const": "response", + "default": "response" + }, + "output": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseOutput" + } + }, + "parallel_tool_calls": { + "type": "boolean", + "default": false + }, + "previous_response_id": { + "type": "string" + }, + "status": { + "type": "string" + }, + "temperature": { + "type": "number" + }, + "top_p": { + "type": "number" + }, + "truncation": { + "type": "string" + }, + "user": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "created_at", + "id", + "model", + "object", + "output", + "parallel_tool_calls", + "status" + ], + "title": "OpenAIResponseObject" + }, + "OpenAIResponseOutput": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseMessage" + }, + { + "$ref": 
"#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "message": "#/components/schemas/OpenAIResponseMessage", + "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall", + "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall", + "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall", + "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" + } + } + }, + "OpenAIResponseOutputMessageMCPCall": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "mcp_call", + "default": "mcp_call" + }, + "arguments": { + "type": "string" + }, + "name": { + "type": "string" + }, + "server_label": { + "type": "string" + }, + "error": { + "type": "string" + }, + "output": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "id", + "type", + "arguments", + "name", + "server_label" + ], + "title": "OpenAIResponseOutputMessageMCPCall" + }, + "OpenAIResponseOutputMessageMCPListTools": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "mcp_list_tools", + "default": "mcp_list_tools" + }, + "server_label": { + "type": "string" + }, + "tools": { + "type": "array", + "items": { + "type": "object", + "properties": { + "input_schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "name": { + "type": "string" + }, + 
"description": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "input_schema", + "name" + ], + "title": "MCPListToolsTool" + } + } + }, + "additionalProperties": false, + "required": [ + "id", + "type", + "server_label", + "tools" + ], + "title": "OpenAIResponseOutputMessageMCPListTools" + }, + "OpenAIResponseObjectStream": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "response.created": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated", + "response.output_text.delta": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta", + "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" + } + } + }, + "OpenAIResponseObjectStreamResponseCompleted": { + "type": "object", + "properties": { + "response": { + "$ref": "#/components/schemas/OpenAIResponseObject" + }, + "type": { + "type": "string", + "const": "response.completed", + "default": "response.completed" + } + }, + "additionalProperties": false, + "required": [ + "response", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseCompleted" + }, + "OpenAIResponseObjectStreamResponseCreated": { + "type": "object", + "properties": { + "response": { + "$ref": "#/components/schemas/OpenAIResponseObject" + }, + "type": { + "type": "string", + "const": "response.created", + "default": "response.created" + } + }, + "additionalProperties": false, + "required": [ + "response", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseCreated" + }, + "OpenAIResponseObjectStreamResponseOutputTextDelta": { + "type": "object", + "properties": { + "content_index": { + "type": "integer" + }, + "delta": { + "type": "string" + }, + 
"item_id": { + "type": "string" + }, + "output_index": { + "type": "integer" + }, + "sequence_number": { + "type": "integer" + }, + "type": { + "type": "string", + "const": "response.output_text.delta", + "default": "response.output_text.delta" + } + }, + "additionalProperties": false, + "required": [ + "content_index", + "delta", + "item_id", + "output_index", + "sequence_number", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseOutputTextDelta" + }, "CreateUploadSessionRequest": { "type": "object", "properties": { "bucket": { "type": "string", - "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)" + "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)." }, "key": { "type": "string", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)" + "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)." }, "mime_type": { "type": "string", - "description": "MIME type of the file" + "description": "MIME type of the file." }, "size": { "type": "integer", - "description": "File size in bytes" + "description": "File size in bytes." 
} }, "additionalProperties": false, @@ -6361,7 +7841,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/ScoringFnParamsType", "const": "basic", "default": "basic" }, @@ -6374,7 +7854,8 @@ }, "additionalProperties": false, "required": [ - "type" + "type", + "aggregation_functions" ], "title": "BasicScoringFnParams" }, @@ -6426,7 +7907,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/ScoringFnParamsType", "const": "llm_as_judge", "default": "llm_as_judge" }, @@ -6452,7 +7933,9 @@ "additionalProperties": false, "required": [ "type", - "judge_model" + "judge_model", + "judge_score_regexes", + "aggregation_functions" ], "title": "LLMAsJudgeScoringFnParams" }, @@ -6490,7 +7973,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/ScoringFnParamsType", "const": "regex_parser", "default": "regex_parser" }, @@ -6509,7 +7992,9 @@ }, "additionalProperties": false, "required": [ - "type" + "type", + "parsing_regexes", + "aggregation_functions" ], "title": "RegexParserScoringFnParams" }, @@ -6534,6 +8019,15 @@ } } }, + "ScoringFnParamsType": { + "type": "string", + "enum": [ + "llm_as_judge", + "regex_parser", + "basic" + ], + "title": "ScoringFnParamsType" + }, "EvaluateRowsRequest": { "type": "object", "properties": { @@ -6802,6 +8296,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "benchmark", "default": "benchmark" }, @@ -6843,7 +8348,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "dataset_id", @@ -6852,6 +8356,482 @@ ], "title": "Benchmark" }, + "OpenAIAssistantMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "assistant", + "default": "assistant", + 
"description": "Must be \"assistant\" to identify this as the model's response" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The content of the model's response" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the assistant message participant." + }, + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionToolCall" + }, + "description": "List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object." + } + }, + "additionalProperties": false, + "required": [ + "role" + ], + "title": "OpenAIAssistantMessageParam", + "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request." + }, + "OpenAIChatCompletionContentPartImageParam": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "image_url", + "default": "image_url" + }, + "image_url": { + "$ref": "#/components/schemas/OpenAIImageURL" + } + }, + "additionalProperties": false, + "required": [ + "type", + "image_url" + ], + "title": "OpenAIChatCompletionContentPartImageParam" + }, + "OpenAIChatCompletionContentPartParam": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam", + "image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + } + }, + "OpenAIChatCompletionContentPartTextParam": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "text", + "default": "text" + }, + "text": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "type", + 
"text" + ], + "title": "OpenAIChatCompletionContentPartTextParam" + }, + "OpenAIChatCompletionToolCall": { + "type": "object", + "properties": { + "index": { + "type": "integer" + }, + "id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "function", + "default": "function" + }, + "function": { + "$ref": "#/components/schemas/OpenAIChatCompletionToolCallFunction" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIChatCompletionToolCall" + }, + "OpenAIChatCompletionToolCallFunction": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionToolCallFunction" + }, + "OpenAIChoice": { + "type": "object", + "properties": { + "message": { + "$ref": "#/components/schemas/OpenAIMessageParam", + "description": "The message from the model" + }, + "finish_reason": { + "type": "string", + "description": "The reason the model stopped generating" + }, + "index": { + "type": "integer", + "description": "The index of the choice" + }, + "logprobs": { + "$ref": "#/components/schemas/OpenAIChoiceLogprobs", + "description": "(Optional) The log probabilities for the tokens in the message" + } + }, + "additionalProperties": false, + "required": [ + "message", + "finish_reason", + "index" + ], + "title": "OpenAIChoice", + "description": "A choice from an OpenAI-compatible chat completion response." 
+ }, + "OpenAIChoiceLogprobs": { + "type": "object", + "properties": { + "content": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITokenLogProb" + }, + "description": "(Optional) The log probabilities for the tokens in the message" + }, + "refusal": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITokenLogProb" + }, + "description": "(Optional) The log probabilities for the tokens in the refusal" + } + }, + "additionalProperties": false, + "title": "OpenAIChoiceLogprobs", + "description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response." + }, + "OpenAIDeveloperMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "developer", + "default": "developer", + "description": "Must be \"developer\" to identify this as a developer message" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The content of the developer message" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the developer message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAIDeveloperMessageParam", + "description": "A message from the developer in an OpenAI-compatible chat completion request."
+ }, + "OpenAIImageURL": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "detail": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "url" + ], + "title": "OpenAIImageURL" + }, + "OpenAIMessageParam": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIUserMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAISystemMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIToolMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIDeveloperMessageParam" + } + ], + "discriminator": { + "propertyName": "role", + "mapping": { + "user": "#/components/schemas/OpenAIUserMessageParam", + "system": "#/components/schemas/OpenAISystemMessageParam", + "assistant": "#/components/schemas/OpenAIAssistantMessageParam", + "tool": "#/components/schemas/OpenAIToolMessageParam", + "developer": "#/components/schemas/OpenAIDeveloperMessageParam" + } + } + }, + "OpenAISystemMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "system", + "default": "system", + "description": "Must be \"system\" to identify this as a system message" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)." + }, + "name": { + "type": "string", + "description": "(Optional) The name of the system message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAISystemMessageParam", + "description": "A system message providing instructions or context to the model." 
+ }, + "OpenAITokenLogProb": { + "type": "object", + "properties": { + "token": { + "type": "string" + }, + "bytes": { + "type": "array", + "items": { + "type": "integer" + } + }, + "logprob": { + "type": "number" + }, + "top_logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITopLogProb" + } + } + }, + "additionalProperties": false, + "required": [ + "token", + "logprob", + "top_logprobs" + ], + "title": "OpenAITokenLogProb", + "description": "The log probability for a token from an OpenAI-compatible chat completion response." + }, + "OpenAIToolMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "tool", + "default": "tool", + "description": "Must be \"tool\" to identify this as a tool response" + }, + "tool_call_id": { + "type": "string", + "description": "Unique identifier for the tool call this response is for" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The response content from the tool" + } + }, + "additionalProperties": false, + "required": [ + "role", + "tool_call_id", + "content" + ], + "title": "OpenAIToolMessageParam", + "description": "A message representing the result of a tool invocation in an OpenAI-compatible chat completion request." + }, + "OpenAITopLogProb": { + "type": "object", + "properties": { + "token": { + "type": "string" + }, + "bytes": { + "type": "array", + "items": { + "type": "integer" + } + }, + "logprob": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "token", + "logprob" + ], + "title": "OpenAITopLogProb", + "description": "The top log probability for a token from an OpenAI-compatible chat completion response." 
+ }, + "OpenAIUserMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "user", + "default": "user", + "description": "Must be \"user\" to identify this as a user message" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The content of the message, which can include text and other media" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the user message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAIUserMessageParam", + "description": "A message from the user in an OpenAI-compatible chat completion request." + }, + "OpenAICompletionWithInputMessages": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the chat completion" + }, + "choices": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChoice" + }, + "description": "List of choices" + }, + "object": { + "type": "string", + "const": "chat.completion", + "default": "chat.completion", + "description": "The object type, which will be \"chat.completion\"" + }, + "created": { + "type": "integer", + "description": "The Unix timestamp in seconds when the chat completion was created" + }, + "model": { + "type": "string", + "description": "The model that was used to generate the chat completion" + }, + "input_messages": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIMessageParam" + } + } + }, + "additionalProperties": false, + "required": [ + "id", + "choices", + "object", + "created", + "model", + "input_messages" + ], + "title": "OpenAICompletionWithInputMessages" + }, "DataSource": { "oneOf": [ { @@ -6883,6 +8863,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + 
"benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "dataset", "default": "dataset" }, @@ -6928,7 +8919,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "purpose", @@ -7058,6 +9048,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "model", "default": "model" }, @@ -7094,7 +9095,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "metadata", @@ -7293,6 +9293,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "scoring_function", "default": "scoring_function" }, @@ -7334,7 +9345,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "metadata", @@ -7386,6 +9396,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "shield", "default": "shield" }, @@ -7418,7 +9439,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type" ], @@ -7490,10 +9510,12 @@ "type": "array", "items": { "type": "string" - } + }, + "description": "The attributes to return in the tree." }, "max_depth": { - "type": "integer" + "type": "integer", + "description": "The maximum depth of the tree." 
} }, "additionalProperties": false, @@ -7598,15 +9620,23 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "tool", "default": "tool" }, "toolgroup_id": { "type": "string" }, - "tool_host": { - "$ref": "#/components/schemas/ToolHost" - }, "description": { "type": "string" }, @@ -7645,25 +9675,14 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "toolgroup_id", - "tool_host", "description", "parameters" ], "title": "Tool" }, - "ToolHost": { - "type": "string", - "enum": [ - "distribution", - "client", - "model_context_protocol" - ], - "title": "ToolHost" - }, "ToolGroup": { "type": "object", "properties": { @@ -7678,6 +9697,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "tool_group", "default": "tool_group" }, @@ -7713,7 +9743,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type" ], @@ -7880,6 +9909,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "vector_db", "default": "vector_db" }, @@ -7893,7 +9933,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "embedding_model", @@ -8015,7 +10054,8 @@ "type": "object", "properties": { "vector_db_id": { - "type": "string" + "type": "string", + "description": "The identifier of the vector database to insert the chunks into." 
}, "chunks": { "type": "array", @@ -8023,7 +10063,8 @@ "type": "object", "properties": { "content": { - "$ref": "#/components/schemas/InterleavedContent" + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the chunk, which can be interleaved text, images, or other types." }, "metadata": { "type": "object", @@ -8048,7 +10089,15 @@ "type": "object" } ] - } + }, + "description": "Metadata associated with the chunk, such as document ID, source, or other relevant information." + }, + "embedding": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Optional embedding for the chunk. If not provided, it will be computed later." } }, "additionalProperties": false, @@ -8056,11 +10105,14 @@ "content", "metadata" ], - "title": "Chunk" - } + "title": "Chunk", + "description": "A chunk of content that can be inserted into a vector database." + }, + "description": "The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. If `embedding` is not provided, it will be computed later." }, "ttl_seconds": { - "type": "integer" + "type": "integer", + "description": "The time to live of the chunks." } }, "additionalProperties": false, @@ -8147,7 +10199,8 @@ "type": "object", "properties": { "tool_name": { - "type": "string" + "type": "string", + "description": "The name of the tool to invoke." }, "kwargs": { "type": "object", @@ -8172,7 +10225,8 @@ "type": "object" } ] - } + }, + "description": "A dictionary of arguments to pass to the tool." 
} }, "additionalProperties": false, @@ -8293,38 +10347,6 @@ ], "title": "Job" }, - "ListAgentSessionsResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Session" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ], - "title": "ListAgentSessionsResponse" - }, - "ListAgentsResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Agent" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ], - "title": "ListAgentsResponse" - }, "BucketResponse": { "type": "object", "properties": { @@ -8372,6 +10394,91 @@ ], "title": "ListBenchmarksResponse" }, + "Order": { + "type": "string", + "enum": [ + "asc", + "desc" + ], + "title": "Order" + }, + "ListOpenAIChatCompletionResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the chat completion" + }, + "choices": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChoice" + }, + "description": "List of choices" + }, + "object": { + "type": "string", + "const": "chat.completion", + "default": "chat.completion", + "description": "The object type, which will be \"chat.completion\"" + }, + "created": { + "type": "integer", + "description": "The Unix timestamp in seconds when the chat completion was created" + }, + "model": { + "type": "string", + "description": "The model that was used to generate the chat completion" + }, + "input_messages": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIMessageParam" + } + } + }, + "additionalProperties": false, + "required": [ + "id", + "choices", + "object", + "created", + "model", + "input_messages" + ], + "title": "OpenAICompletionWithInputMessages" + } + }, + "has_more": { + "type": "boolean" + }, + "first_id": { + "type": 
"string" + }, + "last_id": { + "type": "string" + }, + "object": { + "type": "string", + "const": "list", + "default": "list" + } + }, + "additionalProperties": false, + "required": [ + "data", + "has_more", + "first_id", + "last_id", + "object" + ], + "title": "ListOpenAIChatCompletionResponse" + }, "ListDatasetsResponse": { "type": "object", "properties": { @@ -8422,6 +10529,130 @@ ], "title": "ListModelsResponse" }, + "ListOpenAIResponseInputItem": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInput" + } + }, + "object": { + "type": "string", + "const": "list", + "default": "list" + } + }, + "additionalProperties": false, + "required": [ + "data", + "object" + ], + "title": "ListOpenAIResponseInputItem" + }, + "ListOpenAIResponseObject": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseObjectWithInput" + } + }, + "has_more": { + "type": "boolean" + }, + "first_id": { + "type": "string" + }, + "last_id": { + "type": "string" + }, + "object": { + "type": "string", + "const": "list", + "default": "list" + } + }, + "additionalProperties": false, + "required": [ + "data", + "has_more", + "first_id", + "last_id", + "object" + ], + "title": "ListOpenAIResponseObject" + }, + "OpenAIResponseObjectWithInput": { + "type": "object", + "properties": { + "created_at": { + "type": "integer" + }, + "error": { + "$ref": "#/components/schemas/OpenAIResponseError" + }, + "id": { + "type": "string" + }, + "model": { + "type": "string" + }, + "object": { + "type": "string", + "const": "response", + "default": "response" + }, + "output": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseOutput" + } + }, + "parallel_tool_calls": { + "type": "boolean", + "default": false + }, + "previous_response_id": { + "type": "string" + }, + "status": { + "type": "string" + }, + "temperature": { + "type": 
"number" + }, + "top_p": { + "type": "number" + }, + "truncation": { + "type": "string" + }, + "user": { + "type": "string" + }, + "input": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInput" + } + } + }, + "additionalProperties": false, + "required": [ + "created_at", + "id", + "model", + "object", + "output", + "parallel_tool_calls", + "status", + "input" + ], + "title": "OpenAIResponseObjectWithInput" + }, "ListProvidersResponse": { "type": "object", "properties": { @@ -8595,6 +10826,15 @@ } } }, + "EventType": { + "type": "string", + "enum": [ + "unstructured_log", + "structured_log", + "metric" + ], + "title": "EventType" + }, "LogSeverity": { "type": "string", "enum": [ @@ -8643,7 +10883,7 @@ } }, "type": { - "type": "string", + "$ref": "#/components/schemas/EventType", "const": "metric", "default": "metric" }, @@ -8680,7 +10920,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/StructuredLogType", "const": "span_end", "default": "span_end" }, @@ -8699,7 +10939,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/StructuredLogType", "const": "span_start", "default": "span_start" }, @@ -8753,7 +10993,7 @@ } }, "type": { - "type": "string", + "$ref": "#/components/schemas/EventType", "const": "structured_log", "default": "structured_log" }, @@ -8788,6 +11028,14 @@ } } }, + "StructuredLogType": { + "type": "string", + "enum": [ + "span_start", + "span_end" + ], + "title": "StructuredLogType" + }, "UnstructuredLogEvent": { "type": "object", "properties": { @@ -8824,7 +11072,7 @@ } }, "type": { - "type": "string", + "$ref": "#/components/schemas/EventType", "const": "unstructured_log", "default": "unstructured_log" }, @@ -8850,10 +11098,12 @@ "type": "object", "properties": { "event": { - "$ref": "#/components/schemas/Event" + "$ref": "#/components/schemas/Event", + "description": "The event to log." 
}, "ttl_seconds": { - "type": "integer" + "type": "integer", + "description": "The time to live of the event." } }, "additionalProperties": false, @@ -8863,192 +11113,6 @@ ], "title": "LogEventRequest" }, - "OpenAIAssistantMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant", - "description": "Must be \"assistant\" to identify this as the model's response" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The content of the model's response" - }, - "name": { - "type": "string", - "description": "(Optional) The name of the assistant message participant." - }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionToolCall" - }, - "description": "List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object." - } - }, - "additionalProperties": false, - "required": [ - "role" - ], - "title": "OpenAIAssistantMessageParam", - "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request." 
- }, - "OpenAIChatCompletionContentPartImageParam": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "image_url", - "default": "image_url" - }, - "image_url": { - "$ref": "#/components/schemas/OpenAIImageURL" - } - }, - "additionalProperties": false, - "required": [ - "type", - "image_url" - ], - "title": "OpenAIChatCompletionContentPartImageParam" - }, - "OpenAIChatCompletionContentPartParam": { - "oneOf": [ - { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" - }, - { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam", - "image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" - } - } - }, - "OpenAIChatCompletionContentPartTextParam": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "text", - "default": "text" - }, - "text": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "type", - "text" - ], - "title": "OpenAIChatCompletionContentPartTextParam" - }, - "OpenAIChatCompletionToolCall": { - "type": "object", - "properties": { - "index": { - "type": "integer" - }, - "id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "function", - "default": "function" - }, - "function": { - "$ref": "#/components/schemas/OpenAIChatCompletionToolCallFunction" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "OpenAIChatCompletionToolCall" - }, - "OpenAIChatCompletionToolCallFunction": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "additionalProperties": false, - "title": "OpenAIChatCompletionToolCallFunction" - }, - "OpenAIDeveloperMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "developer", - 
"default": "developer", - "description": "Must be \"developer\" to identify this as a developer message" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The content of the developer message" - }, - "name": { - "type": "string", - "description": "(Optional) The name of the developer message participant." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "title": "OpenAIDeveloperMessageParam", - "description": "A message from the developer in an OpenAI-compatible chat completion request." - }, - "OpenAIImageURL": { - "type": "object", - "properties": { - "url": { - "type": "string" - }, - "detail": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "url" - ], - "title": "OpenAIImageURL" - }, "OpenAIJSONSchema": { "type": "object", "properties": { @@ -9093,35 +11157,6 @@ ], "title": "OpenAIJSONSchema" }, - "OpenAIMessageParam": { - "oneOf": [ - { - "$ref": "#/components/schemas/OpenAIUserMessageParam" - }, - { - "$ref": "#/components/schemas/OpenAISystemMessageParam" - }, - { - "$ref": "#/components/schemas/OpenAIAssistantMessageParam" - }, - { - "$ref": "#/components/schemas/OpenAIToolMessageParam" - }, - { - "$ref": "#/components/schemas/OpenAIDeveloperMessageParam" - } - ], - "discriminator": { - "propertyName": "role", - "mapping": { - "user": "#/components/schemas/OpenAIUserMessageParam", - "system": "#/components/schemas/OpenAISystemMessageParam", - "assistant": "#/components/schemas/OpenAIAssistantMessageParam", - "tool": "#/components/schemas/OpenAIToolMessageParam", - "developer": "#/components/schemas/OpenAIDeveloperMessageParam" - } - } - }, "OpenAIResponseFormatJSONObject": { "type": "object", "properties": { @@ -9192,115 +11227,6 @@ ], "title": "OpenAIResponseFormatText" }, - "OpenAISystemMessageParam": { - "type": "object", - "properties": { 
- "role": { - "type": "string", - "const": "system", - "default": "system", - "description": "Must be \"system\" to identify this as a system message" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)." - }, - "name": { - "type": "string", - "description": "(Optional) The name of the system message participant." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "title": "OpenAISystemMessageParam", - "description": "A system message providing instructions or context to the model." - }, - "OpenAIToolMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "tool", - "default": "tool", - "description": "Must be \"tool\" to identify this as a tool response" - }, - "tool_call_id": { - "type": "string", - "description": "Unique identifier for the tool call this response is for" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The response content from the tool" - } - }, - "additionalProperties": false, - "required": [ - "role", - "tool_call_id", - "content" - ], - "title": "OpenAIToolMessageParam", - "description": "A message representing the result of a tool invocation in an OpenAI-compatible chat completion request." 
- }, - "OpenAIUserMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "user", - "default": "user", - "description": "Must be \"user\" to identify this as a user message" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The content of the message, which can include text and other media" - }, - "name": { - "type": "string", - "description": "(Optional) The name of the user message participant." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "title": "OpenAIUserMessageParam", - "description": "A message from the user in an OpenAI-compatible chat completion request." - }, "OpenaiChatCompletionRequest": { "type": "object", "properties": { @@ -9313,11 +11239,11 @@ "items": { "$ref": "#/components/schemas/OpenAIMessageParam" }, - "description": "List of messages in the conversation" + "description": "List of messages in the conversation." }, "frequency_penalty": { "type": "number", - "description": "(Optional) The penalty for repeated tokens" + "description": "(Optional) The penalty for repeated tokens." }, "function_call": { "oneOf": [ @@ -9350,7 +11276,7 @@ } } ], - "description": "(Optional) The function call to use" + "description": "(Optional) The function call to use." }, "functions": { "type": "array", @@ -9379,46 +11305,46 @@ ] } }, - "description": "(Optional) List of functions to use" + "description": "(Optional) List of functions to use." }, "logit_bias": { "type": "object", "additionalProperties": { "type": "number" }, - "description": "(Optional) The logit bias to use" + "description": "(Optional) The logit bias to use." }, "logprobs": { "type": "boolean", - "description": "(Optional) The log probabilities to use" + "description": "(Optional) The log probabilities to use." 
}, "max_completion_tokens": { "type": "integer", - "description": "(Optional) The maximum number of tokens to generate" + "description": "(Optional) The maximum number of tokens to generate." }, "max_tokens": { "type": "integer", - "description": "(Optional) The maximum number of tokens to generate" + "description": "(Optional) The maximum number of tokens to generate." }, "n": { "type": "integer", - "description": "(Optional) The number of completions to generate" + "description": "(Optional) The number of completions to generate." }, "parallel_tool_calls": { "type": "boolean", - "description": "(Optional) Whether to parallelize tool calls" + "description": "(Optional) Whether to parallelize tool calls." }, "presence_penalty": { "type": "number", - "description": "(Optional) The penalty for repeated tokens" + "description": "(Optional) The penalty for repeated tokens." }, "response_format": { "$ref": "#/components/schemas/OpenAIResponseFormatParam", - "description": "(Optional) The response format to use" + "description": "(Optional) The response format to use." }, "seed": { "type": "integer", - "description": "(Optional) The seed to use" + "description": "(Optional) The seed to use." }, "stop": { "oneOf": [ @@ -9432,11 +11358,11 @@ } } ], - "description": "(Optional) The stop tokens to use" + "description": "(Optional) The stop tokens to use." }, "stream": { "type": "boolean", - "description": "(Optional) Whether to stream the response" + "description": "(Optional) Whether to stream the response." }, "stream_options": { "type": "object", @@ -9462,11 +11388,11 @@ } ] }, - "description": "(Optional) The stream options to use" + "description": "(Optional) The stream options to use." }, "temperature": { "type": "number", - "description": "(Optional) The temperature to use" + "description": "(Optional) The temperature to use." 
}, "tool_choice": { "oneOf": [ @@ -9499,7 +11425,7 @@ } } ], - "description": "(Optional) The tool choice to use" + "description": "(Optional) The tool choice to use." }, "tools": { "type": "array", @@ -9528,19 +11454,19 @@ ] } }, - "description": "(Optional) The tools to use" + "description": "(Optional) The tools to use." }, "top_logprobs": { "type": "integer", - "description": "(Optional) The top log probabilities to use" + "description": "(Optional) The top log probabilities to use." }, "top_p": { "type": "number", - "description": "(Optional) The top p to use" + "description": "(Optional) The top p to use." }, "user": { "type": "string", - "description": "(Optional) The user to use" + "description": "(Optional) The user to use." } }, "additionalProperties": false, @@ -9630,35 +11556,6 @@ "title": "OpenAIChatCompletionChunk", "description": "Chunk from a streaming response to an OpenAI-compatible chat completion request." }, - "OpenAIChoice": { - "type": "object", - "properties": { - "message": { - "$ref": "#/components/schemas/OpenAIMessageParam", - "description": "The message from the model" - }, - "finish_reason": { - "type": "string", - "description": "The reason the model stopped generating" - }, - "index": { - "type": "integer", - "description": "The index of the choice" - }, - "logprobs": { - "$ref": "#/components/schemas/OpenAIChoiceLogprobs", - "description": "(Optional) The log probabilities for the tokens in the message" - } - }, - "additionalProperties": false, - "required": [ - "message", - "finish_reason", - "index" - ], - "title": "OpenAIChoice", - "description": "A choice from an OpenAI-compatible chat completion response." - }, "OpenAIChoiceDelta": { "type": "object", "properties": { @@ -9686,28 +11583,6 @@ "title": "OpenAIChoiceDelta", "description": "A delta from an OpenAI-compatible chat completion streaming response." 
}, - "OpenAIChoiceLogprobs": { - "type": "object", - "properties": { - "content": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAITokenLogProb" - }, - "description": "(Optional) The log probabilities for the tokens in the message" - }, - "refusal": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAITokenLogProb" - }, - "description": "(Optional) The log probabilities for the tokens in the message" - } - }, - "additionalProperties": false, - "title": "OpenAIChoiceLogprobs", - "description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response." - }, "OpenAIChunkChoice": { "type": "object", "properties": { @@ -9737,61 +11612,6 @@ "title": "OpenAIChunkChoice", "description": "A chunk choice from an OpenAI-compatible chat completion streaming response." }, - "OpenAITokenLogProb": { - "type": "object", - "properties": { - "token": { - "type": "string" - }, - "bytes": { - "type": "array", - "items": { - "type": "integer" - } - }, - "logprob": { - "type": "number" - }, - "top_logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAITopLogProb" - } - } - }, - "additionalProperties": false, - "required": [ - "token", - "logprob", - "top_logprobs" - ], - "title": "OpenAITokenLogProb", - "description": "The log probability for a token from an OpenAI-compatible chat completion response." - }, - "OpenAITopLogProb": { - "type": "object", - "properties": { - "token": { - "type": "string" - }, - "bytes": { - "type": "array", - "items": { - "type": "integer" - } - }, - "logprob": { - "type": "number" - } - }, - "additionalProperties": false, - "required": [ - "token", - "logprob" - ], - "title": "OpenAITopLogProb", - "description": "The top log probability for a token from an OpenAI-compatible chat completion response." 
- }, "OpenaiCompletionRequest": { "type": "object", "properties": { @@ -9826,46 +11646,46 @@ } } ], - "description": "The prompt to generate a completion for" + "description": "The prompt to generate a completion for." }, "best_of": { "type": "integer", - "description": "(Optional) The number of completions to generate" + "description": "(Optional) The number of completions to generate." }, "echo": { "type": "boolean", - "description": "(Optional) Whether to echo the prompt" + "description": "(Optional) Whether to echo the prompt." }, "frequency_penalty": { "type": "number", - "description": "(Optional) The penalty for repeated tokens" + "description": "(Optional) The penalty for repeated tokens." }, "logit_bias": { "type": "object", "additionalProperties": { "type": "number" }, - "description": "(Optional) The logit bias to use" + "description": "(Optional) The logit bias to use." }, "logprobs": { "type": "boolean", - "description": "(Optional) The log probabilities to use" + "description": "(Optional) The log probabilities to use." }, "max_tokens": { "type": "integer", - "description": "(Optional) The maximum number of tokens to generate" + "description": "(Optional) The maximum number of tokens to generate." }, "n": { "type": "integer", - "description": "(Optional) The number of completions to generate" + "description": "(Optional) The number of completions to generate." }, "presence_penalty": { "type": "number", - "description": "(Optional) The penalty for repeated tokens" + "description": "(Optional) The penalty for repeated tokens." }, "seed": { "type": "integer", - "description": "(Optional) The seed to use" + "description": "(Optional) The seed to use." }, "stop": { "oneOf": [ @@ -9879,11 +11699,11 @@ } } ], - "description": "(Optional) The stop tokens to use" + "description": "(Optional) The stop tokens to use." 
}, "stream": { "type": "boolean", - "description": "(Optional) Whether to stream the response" + "description": "(Optional) Whether to stream the response." }, "stream_options": { "type": "object", @@ -9909,19 +11729,19 @@ } ] }, - "description": "(Optional) The stream options to use" + "description": "(Optional) The stream options to use." }, "temperature": { "type": "number", - "description": "(Optional) The temperature to use" + "description": "(Optional) The temperature to use." }, "top_p": { "type": "number", - "description": "(Optional) The top p to use" + "description": "(Optional) The top p to use." }, "user": { "type": "string", - "description": "(Optional) The user to use" + "description": "(Optional) The user to use." }, "guided_choice": { "type": "array", @@ -10000,6 +11820,139 @@ "title": "OpenAICompletionChoice", "description": "A choice from an OpenAI-compatible completion response." }, + "OpenaiEmbeddingsRequest": { + "type": "object", + "properties": { + "model": { + "type": "string", + "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." + }, + "input": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings." + }, + "encoding_format": { + "type": "string", + "description": "(Optional) The format to return the embeddings in. Can be either \"float\" or \"base64\". Defaults to \"float\"." + }, + "dimensions": { + "type": "integer", + "description": "(Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models." + }, + "user": { + "type": "string", + "description": "(Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse." 
+ } + }, + "additionalProperties": false, + "required": [ + "model", + "input" + ], + "title": "OpenaiEmbeddingsRequest" + }, + "OpenAIEmbeddingData": { + "type": "object", + "properties": { + "object": { + "type": "string", + "const": "embedding", + "default": "embedding", + "description": "The object type, which will be \"embedding\"" + }, + "embedding": { + "oneOf": [ + { + "type": "array", + "items": { + "type": "number" + } + }, + { + "type": "string" + } + ], + "description": "The embedding vector as a list of floats (when encoding_format=\"float\") or as a base64-encoded string (when encoding_format=\"base64\")" + }, + "index": { + "type": "integer", + "description": "The index of the embedding in the input list" + } + }, + "additionalProperties": false, + "required": [ + "object", + "embedding", + "index" + ], + "title": "OpenAIEmbeddingData", + "description": "A single embedding data object from an OpenAI-compatible embeddings response." + }, + "OpenAIEmbeddingUsage": { + "type": "object", + "properties": { + "prompt_tokens": { + "type": "integer", + "description": "The number of tokens in the input" + }, + "total_tokens": { + "type": "integer", + "description": "The total number of tokens used" + } + }, + "additionalProperties": false, + "required": [ + "prompt_tokens", + "total_tokens" + ], + "title": "OpenAIEmbeddingUsage", + "description": "Usage information for an OpenAI-compatible embeddings response." 
+ }, + "OpenAIEmbeddingsResponse": { + "type": "object", + "properties": { + "object": { + "type": "string", + "const": "list", + "default": "list", + "description": "The object type, which will be \"list\"" + }, + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIEmbeddingData" + }, + "description": "List of embedding data objects" + }, + "model": { + "type": "string", + "description": "The model that was used to generate the embeddings" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIEmbeddingUsage", + "description": "Usage information" + } + }, + "additionalProperties": false, + "required": [ + "object", + "data", + "model", + "usage" + ], + "title": "OpenAIEmbeddingsResponse", + "description": "Response from an OpenAI-compatible embeddings request." + }, "OpenAIModel": { "type": "object", "properties": { @@ -10214,16 +12167,20 @@ "type": "object", "properties": { "job_uuid": { - "type": "string" + "type": "string", + "description": "The UUID of the job to create." }, "finetuned_model": { - "type": "string" + "type": "string", + "description": "The model to fine-tune." }, "algorithm_config": { - "$ref": "#/components/schemas/DPOAlignmentConfig" + "$ref": "#/components/schemas/DPOAlignmentConfig", + "description": "The algorithm configuration." }, "training_config": { - "$ref": "#/components/schemas/TrainingConfig" + "$ref": "#/components/schemas/TrainingConfig", + "description": "The training configuration." }, "hyperparam_search_config": { "type": "object", @@ -10248,7 +12205,8 @@ "type": "object" } ] - } + }, + "description": "The hyperparam search configuration." }, "logger_config": { "type": "object", @@ -10273,7 +12231,8 @@ "type": "object" } ] - } + }, + "description": "The logger configuration." 
} }, "additionalProperties": false, @@ -10347,24 +12306,38 @@ "type": "object", "properties": { "query_generator_config": { - "$ref": "#/components/schemas/RAGQueryGeneratorConfig" + "$ref": "#/components/schemas/RAGQueryGeneratorConfig", + "description": "Configuration for the query generator." }, "max_tokens_in_context": { "type": "integer", - "default": 4096 + "default": 4096, + "description": "Maximum number of tokens in the context." }, "max_chunks": { "type": "integer", - "default": 5 + "default": 5, + "description": "Maximum number of chunks to retrieve." + }, + "chunk_template": { + "type": "string", + "default": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n", + "description": "Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: \"Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n\"" + }, + "mode": { + "type": "string", + "description": "Search mode for retrieval—either \"vector\" or \"keyword\". Default \"vector\"." } }, "additionalProperties": false, "required": [ "query_generator_config", "max_tokens_in_context", - "max_chunks" + "max_chunks", + "chunk_template" ], - "title": "RAGQueryConfig" + "title": "RAGQueryConfig", + "description": "Configuration for the RAG query generation." }, "RAGQueryGeneratorConfig": { "oneOf": [ @@ -10448,10 +12421,12 @@ "type": "object", "properties": { "vector_db_id": { - "type": "string" + "type": "string", + "description": "The identifier of the vector database to query." }, "query": { - "$ref": "#/components/schemas/InterleavedContent" + "$ref": "#/components/schemas/InterleavedContent", + "description": "The query to search for." }, "params": { "type": "object", @@ -10476,7 +12451,8 @@ "type": "object" } ] - } + }, + "description": "The parameters of the query." 
} }, "additionalProperties": false, @@ -10495,7 +12471,8 @@ "type": "object", "properties": { "content": { - "$ref": "#/components/schemas/InterleavedContent" + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the chunk, which can be interleaved text, images, or other types." }, "metadata": { "type": "object", @@ -10520,7 +12497,15 @@ "type": "object" } ] - } + }, + "description": "Metadata associated with the chunk, such as document ID, source, or other relevant information." + }, + "embedding": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Optional embedding for the chunk. If not provided, it will be computed later." } }, "additionalProperties": false, @@ -10528,7 +12513,8 @@ "content", "metadata" ], - "title": "Chunk" + "title": "Chunk", + "description": "A chunk of content that can be inserted into a vector database." } }, "scores": { @@ -10545,6 +12531,147 @@ ], "title": "QueryChunksResponse" }, + "QueryMetricsRequest": { + "type": "object", + "properties": { + "start_time": { + "type": "integer", + "description": "The start time of the metric to query." + }, + "end_time": { + "type": "integer", + "description": "The end time of the metric to query." + }, + "granularity": { + "type": "string", + "description": "The granularity of the metric to query." + }, + "query_type": { + "type": "string", + "enum": [ + "range", + "instant" + ], + "description": "The type of query to perform." + }, + "label_matchers": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": [ + "=", + "!=", + "=~", + "!~" + ], + "title": "MetricLabelOperator", + "default": "=" + } + }, + "additionalProperties": false, + "required": [ + "name", + "value", + "operator" + ], + "title": "MetricLabelMatcher" + }, + "description": "The label matchers to apply to the metric." 
+ } + }, + "additionalProperties": false, + "required": [ + "start_time", + "query_type" + ], + "title": "QueryMetricsRequest" + }, + "MetricDataPoint": { + "type": "object", + "properties": { + "timestamp": { + "type": "integer" + }, + "value": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "timestamp", + "value" + ], + "title": "MetricDataPoint" + }, + "MetricLabel": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "name", + "value" + ], + "title": "MetricLabel" + }, + "MetricSeries": { + "type": "object", + "properties": { + "metric": { + "type": "string" + }, + "labels": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricLabel" + } + }, + "values": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricDataPoint" + } + } + }, + "additionalProperties": false, + "required": [ + "metric", + "labels", + "values" + ], + "title": "MetricSeries" + }, + "QueryMetricsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricSeries" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "QueryMetricsResponse" + }, "QueryCondition": { "type": "object", "properties": { @@ -10602,16 +12729,19 @@ "type": "array", "items": { "$ref": "#/components/schemas/QueryCondition" - } + }, + "description": "The attribute filters to apply to the spans." }, "attributes_to_return": { "type": "array", "items": { "type": "string" - } + }, + "description": "The attributes to return in the spans." }, "max_depth": { - "type": "integer" + "type": "integer", + "description": "The maximum depth of the tree." 
} }, "additionalProperties": false, @@ -10644,19 +12774,23 @@ "type": "array", "items": { "$ref": "#/components/schemas/QueryCondition" - } + }, + "description": "The attribute filters to apply to the traces." }, "limit": { - "type": "integer" + "type": "integer", + "description": "The limit of traces to return." }, "offset": { - "type": "integer" + "type": "integer", + "description": "The offset of the traces to return." }, "order_by": { "type": "array", "items": { "type": "string" - } + }, + "description": "The order by of the traces to return." } }, "additionalProperties": false, @@ -10682,22 +12816,27 @@ "type": "object", "properties": { "benchmark_id": { - "type": "string" + "type": "string", + "description": "The ID of the benchmark to register." }, "dataset_id": { - "type": "string" + "type": "string", + "description": "The ID of the dataset to use for the benchmark." }, "scoring_functions": { "type": "array", "items": { "type": "string" - } + }, + "description": "The scoring functions to use for the benchmark." }, "provider_benchmark_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider benchmark to use for the benchmark." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider to use for the benchmark." }, "metadata": { "type": "object", @@ -10722,7 +12861,8 @@ "type": "object" } ] - } + }, + "description": "The metadata to use for the benchmark." } }, "additionalProperties": false, @@ -10743,7 +12883,7 @@ "eval/question-answer", "eval/messages-answer" ], - "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } - \"eval/question-answer\": The dataset contains a question column and an answer column for evaluation. 
{ \"question\": \"What is the capital of France?\", \"answer\": \"Paris\" } - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column for evaluation. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, my name is John Doe.\"}, {\"role\": \"assistant\", \"content\": \"Hello, John Doe. How can I help you today?\"}, {\"role\": \"user\", \"content\": \"What's my name?\"}, ], \"answer\": \"John Doe\" }" + "description": "The purpose of the dataset. One of: - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } - \"eval/question-answer\": The dataset contains a question column and an answer column for evaluation. { \"question\": \"What is the capital of France?\", \"answer\": \"Paris\" } - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column for evaluation. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, my name is John Doe.\"}, {\"role\": \"assistant\", \"content\": \"Hello, John Doe. How can I help you today?\"}, {\"role\": \"user\", \"content\": \"What's my name?\"}, ], \"answer\": \"John Doe\" }" }, "source": { "$ref": "#/components/schemas/DataSource", @@ -10773,7 +12913,7 @@ } ] }, - "description": "The metadata for the dataset. - E.g. {\"description\": \"My dataset\"}" + "description": "The metadata for the dataset. - E.g. {\"description\": \"My dataset\"}." }, "dataset_id": { "type": "string", @@ -10791,13 +12931,16 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to register." }, "provider_model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model in the provider." 
}, "provider_id": { - "type": "string" + "type": "string", + "description": "The identifier of the provider." }, "metadata": { "type": "object", @@ -10822,10 +12965,12 @@ "type": "object" } ] - } + }, + "description": "Any additional metadata for this model." }, "model_type": { - "$ref": "#/components/schemas/ModelType" + "$ref": "#/components/schemas/ModelType", + "description": "The type of model to register." } }, "additionalProperties": false, @@ -10838,22 +12983,28 @@ "type": "object", "properties": { "scoring_fn_id": { - "type": "string" + "type": "string", + "description": "The ID of the scoring function to register." }, "description": { - "type": "string" + "type": "string", + "description": "The description of the scoring function." }, "return_type": { - "$ref": "#/components/schemas/ParamType" + "$ref": "#/components/schemas/ParamType", + "description": "The return type of the scoring function." }, "provider_scoring_fn_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider scoring function to use for the scoring function." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider to use for the scoring function." }, "params": { - "$ref": "#/components/schemas/ScoringFnParams" + "$ref": "#/components/schemas/ScoringFnParams", + "description": "The parameters for the scoring function for benchmark eval, these can be overridden for app eval." } }, "additionalProperties": false, @@ -10868,13 +13019,16 @@ "type": "object", "properties": { "shield_id": { - "type": "string" + "type": "string", + "description": "The identifier of the shield to register." }, "provider_shield_id": { - "type": "string" + "type": "string", + "description": "The identifier of the shield in the provider." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The identifier of the provider." 
}, "params": { "type": "object", @@ -10899,7 +13053,8 @@ "type": "object" } ] - } + }, + "description": "The parameters of the shield." } }, "additionalProperties": false, @@ -10912,13 +13067,16 @@ "type": "object", "properties": { "toolgroup_id": { - "type": "string" + "type": "string", + "description": "The ID of the tool group to register." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider to use for the tool group." }, "mcp_endpoint": { - "$ref": "#/components/schemas/URL" + "$ref": "#/components/schemas/URL", + "description": "The MCP endpoint to use for the tool group." }, "args": { "type": "object", @@ -10943,7 +13101,8 @@ "type": "object" } ] - } + }, + "description": "A dictionary of arguments to pass to the tool group." } }, "additionalProperties": false, @@ -10957,19 +13116,24 @@ "type": "object", "properties": { "vector_db_id": { - "type": "string" + "type": "string", + "description": "The identifier of the vector database to register." }, "embedding_model": { - "type": "string" + "type": "string", + "description": "The embedding model to use." }, "embedding_dimension": { - "type": "integer" + "type": "integer", + "description": "The dimension of the embedding model." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The identifier of the provider." }, "provider_vector_db_id": { - "type": "string" + "type": "string", + "description": "The identifier of the vector database in the provider." } }, "additionalProperties": false, @@ -11018,13 +13182,15 @@ "type": "object", "properties": { "shield_id": { - "type": "string" + "type": "string", + "description": "The identifier of the shield to run." }, "messages": { "type": "array", "items": { "$ref": "#/components/schemas/Message" - } + }, + "description": "The messages to run the shield on." }, "params": { "type": "object", @@ -11049,7 +13215,8 @@ "type": "object" } ] - } + }, + "description": "The parameters of the shield." 
} }, "additionalProperties": false, @@ -11077,19 +13244,23 @@ "type": "array", "items": { "$ref": "#/components/schemas/QueryCondition" - } + }, + "description": "The attribute filters to apply to the spans." }, "attributes_to_save": { "type": "array", "items": { "type": "string" - } + }, + "description": "The attributes to save to the dataset." }, "dataset_id": { - "type": "string" + "type": "string", + "description": "The ID of the dataset to save the spans to." }, "max_depth": { - "type": "integer" + "type": "integer", + "description": "The maximum depth of the tree." } }, "additionalProperties": false, @@ -11176,7 +13347,8 @@ "type": "object", "properties": { "dataset_id": { - "type": "string" + "type": "string", + "description": "The ID of the dataset to score." }, "scoring_functions": { "type": "object", @@ -11189,10 +13361,12 @@ "type": "null" } ] - } + }, + "description": "The scoring functions to use for the scoring." }, "save_results_dataset": { - "type": "boolean" + "type": "boolean", + "description": "Whether to save the results to a dataset." } }, "additionalProperties": false, @@ -11312,10 +13486,12 @@ "type": "object", "properties": { "job_uuid": { - "type": "string" + "type": "string", + "description": "The UUID of the job to create." }, "training_config": { - "$ref": "#/components/schemas/TrainingConfig" + "$ref": "#/components/schemas/TrainingConfig", + "description": "The training configuration." }, "hyperparam_search_config": { "type": "object", @@ -11340,7 +13516,8 @@ "type": "object" } ] - } + }, + "description": "The hyperparam search configuration." }, "logger_config": { "type": "object", @@ -11365,16 +13542,20 @@ "type": "object" } ] - } + }, + "description": "The logger configuration." }, "model": { - "type": "string" + "type": "string", + "description": "The model to fine-tune." }, "checkpoint_dir": { - "type": "string" + "type": "string", + "description": "The directory to save checkpoint(s) to." 
}, "algorithm_config": { - "$ref": "#/components/schemas/AlgorithmConfig" + "$ref": "#/components/schemas/AlgorithmConfig", + "description": "The algorithm configuration." } }, "additionalProperties": false, diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a24f1a9db..7638c3cbd 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -27,10 +27,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - DatasetIO - description: '' + description: Append rows to a dataset. parameters: - name: dataset_id in: path + description: >- + The ID of the dataset to append the rows to. required: true schema: type: string @@ -44,7 +46,8 @@ paths: post: responses: '200': - description: OK + description: >- + A BatchChatCompletionResponse with the full completions. content: application/json: schema: @@ -61,7 +64,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Inference - description: '' + description: >- + Generate chat completions for a batch of messages using the specified model. parameters: [] requestBody: content: @@ -73,7 +77,8 @@ paths: post: responses: '200': - description: OK + description: >- + A BatchCompletionResponse with the full completions. content: application/json: schema: @@ -90,7 +95,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Inference - description: '' + description: >- + Generate completions for a batch of content using the specified model. parameters: [] requestBody: content: @@ -115,7 +121,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Cancel a training job. parameters: [] requestBody: content: @@ -129,7 +135,7 @@ paths: '200': description: >- If stream=False, returns a ChatCompletionResponse with the full completion. 
- If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk + If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk. content: application/json: schema: @@ -164,7 +170,7 @@ paths: '200': description: >- If stream=False, returns a CompletionResponse with the full completion. - If stream=True, returns an SSE event stream of CompletionResponseStreamChunk + If stream=True, returns an SSE event stream of CompletionResponseStreamChunk. content: application/json: schema: @@ -197,11 +203,11 @@ paths: get: responses: '200': - description: A ListAgentsResponse. + description: A PaginatedResponse. content: application/json: schema: - $ref: '#/components/schemas/ListAgentsResponse' + $ref: '#/components/schemas/PaginatedResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -215,7 +221,19 @@ paths: tags: - Agents description: List all agents. - parameters: [] + parameters: + - name: start_index + in: query + description: The index to start the pagination from. + required: false + schema: + type: integer + - name: limit + in: query + description: The number of agents to return. + required: false + schema: + type: integer post: responses: '200': @@ -288,7 +306,7 @@ paths: '200': description: >- If stream=False, returns a Turn object. If stream=True, returns an SSE - event stream of AgentTurnResponseStreamChunk + event stream of AgentTurnResponseStreamChunk. content: application/json: schema: @@ -330,11 +348,90 @@ paths: schema: $ref: '#/components/schemas/CreateAgentTurnRequest' required: true + /v1/openai/v1/responses: + get: + responses: + '200': + description: A ListOpenAIResponseObject. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ListOpenAIResponseObject' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: List all OpenAI responses. + parameters: + - name: after + in: query + description: The ID of the last response to return. + required: false + schema: + type: string + - name: limit + in: query + description: The number of responses to return. + required: false + schema: + type: integer + - name: model + in: query + description: The model to filter responses by. + required: false + schema: + type: string + - name: order + in: query + description: >- + The order to sort responses by when sorted by created_at ('asc' or 'desc'). + required: false + schema: + $ref: '#/components/schemas/Order' + post: + responses: + '200': + description: An OpenAIResponseObject. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIResponseObject' + text/event-stream: + schema: + $ref: '#/components/schemas/OpenAIResponseObjectStream' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: Create a new OpenAI response. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/CreateOpenaiResponseRequest' + required: true /v1/files: get: responses: '200': - description: OK + description: A ListBucketResponse. content: application/json: schema: @@ -355,13 +452,14 @@ paths: parameters: - name: bucket in: query + description: 'Bucket name (valid chars: a-zA-Z0-9_-).' 
required: true schema: type: string post: responses: '200': - description: OK + description: A FileUploadResponse. content: application/json: schema: @@ -432,7 +530,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Agents - description: Delete an agent by its ID. + description: >- + Delete an agent by its ID and its associated sessions and turns. parameters: - name: agent_id in: path @@ -444,7 +543,7 @@ paths: get: responses: '200': - description: OK + description: A Session. content: application/json: schema: @@ -501,7 +600,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Agents - description: Delete an agent session by its ID. + description: >- + Delete an agent session by its ID and its associated turns. parameters: - name: session_id in: path @@ -520,7 +620,7 @@ paths: get: responses: '200': - description: OK + description: A FileResponse. content: application/json: schema: @@ -542,14 +642,14 @@ paths: parameters: - name: bucket in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-)' + description: 'Bucket name (valid chars: a-zA-Z0-9_-).' required: true schema: type: string - name: key in: path description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.) + Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). required: true schema: type: string @@ -574,14 +674,14 @@ paths: parameters: - name: bucket in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-)' + description: 'Bucket name (valid chars: a-zA-Z0-9_-).' required: true schema: type: string - name: key in: path description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.) + Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). required: true schema: type: string @@ -592,7 +692,7 @@ paths: description: >- An array of embeddings, one for each content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you - can check model metadata using /models/{model_id} + can check model metadata using /models/{model_id}. content: application/json: schema: @@ -623,7 +723,7 @@ paths: responses: '200': description: >- - EvaluateResponse object containing generations and scores + EvaluateResponse object containing generations and scores. content: application/json: schema: @@ -749,7 +849,7 @@ paths: get: responses: '200': - description: OK + description: A Benchmark. content: application/json: schema: @@ -766,10 +866,40 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: Get a benchmark by its ID. parameters: - name: benchmark_id in: path + description: The ID of the benchmark to get. + required: true + schema: + type: string + /v1/openai/v1/chat/completions/{completion_id}: + get: + responses: + '200': + description: An OpenAICompletionWithInputMessages. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAICompletionWithInputMessages' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: Describe a chat completion by its ID. + parameters: + - name: completion_id + in: path + description: ID of the chat completion. required: true schema: type: string @@ -777,7 +907,7 @@ paths: get: responses: '200': - description: OK + description: A Dataset. content: application/json: schema: @@ -794,10 +924,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Datasets - description: '' + description: Get a dataset by its ID. parameters: - name: dataset_id in: path + description: The ID of the dataset to get.
required: true schema: type: string @@ -817,10 +948,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Datasets - description: '' + description: Unregister a dataset by its ID. parameters: - name: dataset_id in: path + description: The ID of the dataset to unregister. required: true schema: type: string @@ -828,7 +960,7 @@ paths: get: responses: '200': - description: OK + description: A Model. content: application/json: schema: @@ -845,10 +977,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Models - description: '' + description: Get a model by its identifier. parameters: - name: model_id in: path + description: The identifier of the model to get. required: true schema: type: string @@ -868,10 +1001,42 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Models - description: '' + description: Unregister a model. parameters: - name: model_id in: path + description: >- + The identifier of the model to unregister. + required: true + schema: + type: string + /v1/openai/v1/responses/{response_id}: + get: + responses: + '200': + description: An OpenAIResponseObject. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIResponseObject' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: Retrieve an OpenAI response by its ID. + parameters: + - name: response_id + in: path + description: >- + The ID of the OpenAI response to retrieve. required: true schema: type: string @@ -879,7 +1044,7 @@ paths: get: responses: '200': - description: OK + description: A ScoringFn. content: application/json: schema: @@ -896,10 +1061,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ScoringFunctions - description: '' + description: Get a scoring function by its ID. 
parameters: - name: scoring_fn_id in: path + description: The ID of the scoring function to get. required: true schema: type: string @@ -907,7 +1073,7 @@ paths: get: responses: '200': - description: OK + description: A Shield. content: application/json: schema: @@ -924,10 +1090,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Shields - description: '' + description: Get a shield by its identifier. parameters: - name: identifier in: path + description: The identifier of the shield to get. required: true schema: type: string @@ -935,7 +1102,7 @@ paths: get: responses: '200': - description: OK + description: A Span. content: application/json: schema: @@ -952,15 +1119,18 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Get a span by its ID. parameters: - name: trace_id in: path + description: >- + The ID of the trace to get the span from. required: true schema: type: string - name: span_id in: path + description: The ID of the span to get. required: true schema: type: string @@ -968,7 +1138,7 @@ paths: post: responses: '200': - description: OK + description: A QuerySpanTreeResponse. content: application/json: schema: @@ -985,10 +1155,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Get a span tree by its ID. parameters: - name: span_id in: path + description: The ID of the span to get the tree from. required: true schema: type: string @@ -1002,7 +1173,7 @@ paths: get: responses: '200': - description: OK + description: A Tool. content: application/json: schema: @@ -1019,10 +1190,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ToolGroups - description: '' + description: Get a tool by its name. parameters: - name: tool_name in: path + description: The name of the tool to get. required: true schema: type: string @@ -1030,7 +1202,7 @@ paths: get: responses: '200': - description: OK + description: A ToolGroup. 
content: application/json: schema: @@ -1047,10 +1219,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ToolGroups - description: '' + description: Get a tool group by its ID. parameters: - name: toolgroup_id in: path + description: The ID of the tool group to get. required: true schema: type: string @@ -1070,10 +1243,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ToolGroups - description: Unregister a tool group + description: Unregister a tool group. parameters: - name: toolgroup_id in: path + description: The ID of the tool group to unregister. required: true schema: type: string @@ -1081,7 +1255,7 @@ paths: get: responses: '200': - description: OK + description: A Trace. content: application/json: schema: @@ -1098,10 +1272,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Get a trace by its ID. parameters: - name: trace_id in: path + description: The ID of the trace to get. required: true schema: type: string @@ -1109,7 +1284,7 @@ paths: get: responses: '200': - description: OK + description: A PostTrainingJobArtifactsResponse. content: application/json: schema: @@ -1126,10 +1301,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Get the artifacts of a training job. parameters: - name: job_uuid in: query + description: >- + The UUID of the job to get the artifacts of. required: true schema: type: string @@ -1137,7 +1314,7 @@ paths: get: responses: '200': - description: OK + description: A PostTrainingJobStatusResponse. content: application/json: schema: @@ -1154,10 +1331,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Get the status of a training job. parameters: - name: job_uuid in: query + description: >- + The UUID of the job to get the status of. 
required: true schema: type: string @@ -1165,7 +1344,7 @@ paths: get: responses: '200': - description: OK + description: A ListPostTrainingJobsResponse. content: application/json: schema: @@ -1182,13 +1361,13 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Get all training jobs. parameters: [] /v1/files/session:{upload_id}: get: responses: '200': - description: OK + description: A FileUploadResponse. content: application/json: schema: @@ -1206,18 +1385,19 @@ paths: tags: - Files description: >- - Returns information about an existsing upload session + Returns information about an existing upload session. parameters: - name: upload_id in: path - description: ID of the upload session + description: ID of the upload session. required: true schema: type: string post: responses: '200': - description: OK + description: >- + A FileResponse or None if the upload is not complete. content: application/json: schema: @@ -1242,7 +1422,7 @@ paths: parameters: - name: upload_id in: path - description: ID of the upload session + description: ID of the upload session. required: true schema: type: string @@ -1257,7 +1437,7 @@ paths: get: responses: '200': - description: OK + description: A VectorDB. content: application/json: schema: @@ -1274,10 +1454,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - VectorDBs - description: '' + description: Get a vector database by its identifier. parameters: - name: vector_db_id in: path + description: >- + The identifier of the vector database to get. required: true schema: type: string @@ -1297,10 +1479,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - VectorDBs - description: '' + description: Unregister a vector database. parameters: - name: vector_db_id in: path + description: >- + The identifier of the vector database to unregister.
required: true schema: type: string @@ -1308,7 +1492,7 @@ paths: get: responses: '200': - description: OK + description: A HealthInfo. content: application/json: schema: @@ -1325,7 +1509,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Inspect - description: '' + description: Get the health of the service. parameters: [] /v1/tool-runtime/rag-tool/insert: post: @@ -1370,7 +1554,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - VectorIO - description: '' + description: Insert chunks into a vector database. parameters: [] requestBody: content: @@ -1382,7 +1566,8 @@ paths: get: responses: '200': - description: OK + description: >- + A ProviderInfo object containing the provider's details. content: application/json: schema: @@ -1399,10 +1584,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Providers - description: '' + description: >- + Get detailed information about a specific provider. parameters: - name: provider_id in: path + description: The ID of the provider to inspect. required: true schema: type: string @@ -1410,7 +1597,7 @@ paths: post: responses: '200': - description: OK + description: A ToolInvocationResult. content: application/json: schema: @@ -1427,7 +1614,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ToolRuntime - description: Run a tool with the given arguments + description: Run a tool with the given arguments. parameters: [] requestBody: content: @@ -1439,7 +1626,7 @@ paths: get: responses: '200': - description: OK + description: A PaginatedResponse. content: application/json: schema: @@ -1468,9 +1655,9 @@ paths: The response includes: - - data: List of items for the current page + - data: List of items for the current page. - - has_more: Whether there are more items available after this set + - has_more: Whether there are more items available after this set. 
parameters: - name: dataset_id in: path @@ -1496,7 +1683,7 @@ paths: get: responses: '200': - description: The status of the evaluationjob. + description: The status of the evaluation job. content: application/json: schema: @@ -1599,11 +1786,11 @@ paths: get: responses: '200': - description: A ListAgentSessionsResponse. + description: A PaginatedResponse. content: application/json: schema: - $ref: '#/components/schemas/ListAgentSessionsResponse' + $ref: '#/components/schemas/PaginatedResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1625,11 +1812,23 @@ paths: required: true schema: type: string + - name: start_index + in: query + description: The index to start the pagination from. + required: false + schema: + type: integer + - name: limit + in: query + description: The number of sessions to return. + required: false + schema: + type: integer /v1/eval/benchmarks: get: responses: '200': - description: OK + description: A ListBenchmarksResponse. content: application/json: schema: @@ -1646,7 +1845,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: List all benchmarks. parameters: [] post: responses: @@ -1664,7 +1863,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: Register a benchmark. 
parameters: [] requestBody: content: @@ -1672,472 +1871,61 @@ paths: schema: $ref: '#/components/schemas/RegisterBenchmarkRequest' required: true - /v1/datasets: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListDatasetsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Datasets - description: '' - parameters: [] - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/Dataset' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Datasets - description: Register a new dataset. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterDatasetRequest' - required: true - /v1/files/{bucket}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListFileResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: List all files in a bucket. 
- parameters: - - name: bucket - in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-)' - required: true - schema: - type: string - /v1/models: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListModelsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Models - description: '' - parameters: [] - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/Model' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Models - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterModelRequest' - required: true - /v1/providers: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListProvidersResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Providers - description: '' - parameters: [] - /v1/inspect/routes: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListRoutesResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: 
'#/components/responses/DefaultError' - tags: - - Inspect - description: '' - parameters: [] - /v1/tool-runtime/list-tools: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListToolDefsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ToolRuntime - description: '' - parameters: - - name: tool_group_id - in: query - required: false - schema: - type: string - - name: mcp_endpoint - in: query - required: false - schema: - $ref: '#/components/schemas/URL' - /v1/scoring-functions: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListScoringFunctionsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: [] - post: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterScoringFunctionRequest' - required: true - /v1/shields: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListShieldsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': 
- $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Shields - description: '' - parameters: [] - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/Shield' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Shields - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterShieldRequest' - required: true - /v1/toolgroups: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListToolGroupsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ToolGroups - description: List tool groups with optional provider - parameters: [] - post: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ToolGroups - description: Register a tool group - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterToolGroupRequest' - required: true - /v1/tools: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListToolsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - 
'500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ToolGroups - description: List tools with optional tool group - parameters: - - name: toolgroup_id - in: query - required: false - schema: - type: string - /v1/vector-dbs: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListVectorDBsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - VectorDBs - description: '' - parameters: [] - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/VectorDB' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - VectorDBs - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterVectorDbRequest' - required: true - /v1/telemetry/events: - post: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Telemetry - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/LogEventRequest' - required: true /v1/openai/v1/chat/completions: + get: + responses: + '200': + description: A ListOpenAIChatCompletionResponse. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ListOpenAIChatCompletionResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: List all chat completions. + parameters: + - name: after + in: query + description: >- + The ID of the last chat completion to return. + required: false + schema: + type: string + - name: limit + in: query + description: >- + The maximum number of chat completions to return. + required: false + schema: + type: integer + - name: model + in: query + description: The model to filter by. + required: false + schema: + type: string + - name: order + in: query + description: >- + The order to sort the chat completions by: "asc" or "desc". Defaults to + "desc". + required: false + schema: + $ref: '#/components/schemas/Order' post: responses: '200': - description: >- - Response from an OpenAI-compatible chat completion request. **OR** Chunk - from a streaming response to an OpenAI-compatible chat completion request. + description: An OpenAIChatCompletion. content: application/json: schema: @@ -2166,11 +1954,546 @@ paths: schema: $ref: '#/components/schemas/OpenaiChatCompletionRequest' required: true - /v1/openai/v1/completions: + /v1/datasets: + get: + responses: + '200': + description: A ListDatasetsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListDatasetsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Datasets + description: List all datasets. + parameters: [] + post: + responses: + '200': + description: A Dataset. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/Dataset' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Datasets + description: Register a new dataset. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterDatasetRequest' + required: true + /v1/files/{bucket}: + get: + responses: + '200': + description: A ListFileResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListFileResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Files + description: List all files in a bucket. + parameters: + - name: bucket + in: path + description: 'Bucket name (valid chars: a-zA-Z0-9_-).' + required: true + schema: + type: string + /v1/models: + get: + responses: + '200': + description: A ListModelsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListModelsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Models + description: List all models. + parameters: [] + post: + responses: + '200': + description: A Model. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/Model' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Models + description: Register a model. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterModelRequest' + required: true + /v1/openai/v1/responses/{response_id}/input_items: + get: + responses: + '200': + description: An ListOpenAIResponseInputItem. + content: + application/json: + schema: + $ref: '#/components/schemas/ListOpenAIResponseInputItem' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: >- + List input items for a given OpenAI response. + parameters: + - name: response_id + in: path + description: >- + The ID of the response to retrieve input items for. + required: true + schema: + type: string + - name: after + in: query + description: >- + An item ID to list items after, used for pagination. + required: false + schema: + type: string + - name: before + in: query + description: >- + An item ID to list items before, used for pagination. + required: false + schema: + type: string + - name: include + in: query + description: >- + Additional fields to include in the response. + required: false + schema: + type: array + items: + type: string + - name: limit + in: query + description: >- + A limit on the number of objects to be returned. Limit can range between + 1 and 100, and the default is 20. + required: false + schema: + type: integer + - name: order + in: query + description: >- + The order to return the input items in. Default is desc. 
+ required: false + schema: + $ref: '#/components/schemas/Order' + /v1/providers: + get: + responses: + '200': + description: >- + A ListProvidersResponse containing information about all providers. + content: + application/json: + schema: + $ref: '#/components/schemas/ListProvidersResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: List all available providers. + parameters: [] + /v1/inspect/routes: + get: + responses: + '200': + description: A ListRoutesResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListRoutesResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inspect + description: List all routes. + parameters: [] + /v1/tool-runtime/list-tools: + get: + responses: + '200': + description: A ListToolDefsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListToolDefsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ToolRuntime + description: List all tools in the runtime. + parameters: + - name: tool_group_id + in: query + description: >- + The ID of the tool group to list tools for. + required: false + schema: + type: string + - name: mcp_endpoint + in: query + description: >- + The MCP endpoint to use for the tool group. 
+ required: false + schema: + $ref: '#/components/schemas/URL' + /v1/scoring-functions: + get: + responses: + '200': + description: A ListScoringFunctionsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListScoringFunctionsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ScoringFunctions + description: List all scoring functions. + parameters: [] post: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ScoringFunctions + description: Register a scoring function. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterScoringFunctionRequest' + required: true + /v1/shields: + get: + responses: + '200': + description: A ListShieldsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListShieldsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Shields + description: List all shields. + parameters: [] + post: + responses: + '200': + description: A Shield. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/Shield' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Shields + description: Register a shield. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterShieldRequest' + required: true + /v1/toolgroups: + get: + responses: + '200': + description: A ListToolGroupsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListToolGroupsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ToolGroups + description: List tool groups with optional provider. + parameters: [] + post: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ToolGroups + description: Register a tool group. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterToolGroupRequest' + required: true + /v1/tools: + get: + responses: + '200': + description: A ListToolsResponse. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ListToolsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ToolGroups + description: List tools with optional tool group. + parameters: + - name: toolgroup_id + in: query + description: >- + The ID of the tool group to list tools for. + required: false + schema: + type: string + /v1/vector-dbs: + get: + responses: + '200': + description: A ListVectorDBsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListVectorDBsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - VectorDBs + description: List all vector databases. + parameters: [] + post: + responses: + '200': + description: A VectorDB. + content: + application/json: + schema: + $ref: '#/components/schemas/VectorDB' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - VectorDBs + description: Register a vector database. 
+ parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterVectorDbRequest' + required: true + /v1/telemetry/events: + post: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Telemetry + description: Log an event. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/LogEventRequest' + required: true + /v1/openai/v1/completions: + post: + responses: + '200': + description: An OpenAICompletion. content: application/json: schema: @@ -2197,11 +2520,43 @@ paths: schema: $ref: '#/components/schemas/OpenaiCompletionRequest' required: true + /v1/openai/v1/embeddings: + post: + responses: + '200': + description: >- + An OpenAIEmbeddingsResponse containing the embeddings. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIEmbeddingsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: >- + Generate OpenAI-compatible embeddings for the given input using the specified + model. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/OpenaiEmbeddingsRequest' + required: true /v1/openai/v1/models: get: responses: '200': - description: OK + description: A OpenAIListModelsResponse. content: application/json: schema: @@ -2218,13 +2573,13 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Models - description: '' + description: List models using the OpenAI API. 
parameters: [] /v1/post-training/preference-optimize: post: responses: '200': - description: OK + description: A PostTrainingJob. content: application/json: schema: @@ -2241,7 +2596,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Run preference optimization of a model. parameters: [] requestBody: content: @@ -2283,7 +2638,7 @@ paths: post: responses: '200': - description: OK + description: A QueryChunksResponse. content: application/json: schema: @@ -2300,7 +2655,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - VectorIO - description: '' + description: Query chunks from a vector database. parameters: [] requestBody: content: @@ -2308,11 +2663,46 @@ paths: schema: $ref: '#/components/schemas/QueryChunksRequest' required: true + /v1/telemetry/metrics/{metric_name}: + post: + responses: + '200': + description: A QueryMetricsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/QueryMetricsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Telemetry + description: Query metrics. + parameters: + - name: metric_name + in: path + description: The name of the metric to query. + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/QueryMetricsRequest' + required: true /v1/telemetry/spans: post: responses: '200': - description: OK + description: A QuerySpansResponse. content: application/json: schema: @@ -2329,7 +2719,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Query spans. 
parameters: [] requestBody: content: @@ -2341,7 +2731,7 @@ paths: post: responses: '200': - description: OK + description: A QueryTracesResponse. content: application/json: schema: @@ -2358,7 +2748,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Query traces. parameters: [] requestBody: content: @@ -2464,7 +2854,7 @@ paths: post: responses: '200': - description: OK + description: A RunShieldResponse. content: application/json: schema: @@ -2481,7 +2871,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Safety - description: '' + description: Run a shield. parameters: [] requestBody: content: @@ -2506,7 +2896,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Save spans to a dataset. parameters: [] requestBody: content: @@ -2519,7 +2909,7 @@ paths: responses: '200': description: >- - ScoreResponse object containing rows and aggregated results + A ScoreResponse object containing rows and aggregated results. content: application/json: schema: @@ -2548,7 +2938,7 @@ paths: post: responses: '200': - description: OK + description: A ScoreBatchResponse. content: application/json: schema: @@ -2565,7 +2955,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Scoring - description: '' + description: Score a batch of rows. parameters: [] requestBody: content: @@ -2577,7 +2967,7 @@ paths: post: responses: '200': - description: OK + description: A PostTrainingJob. content: application/json: schema: @@ -2594,7 +2984,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Run supervised fine-tuning of a model. parameters: [] requestBody: content: @@ -2635,7 +3025,7 @@ paths: get: responses: '200': - description: OK + description: A VersionInfo. 
content: application/json: schema: @@ -2652,7 +3042,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Inspect - description: '' + description: Get the version of the service. parameters: [] jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema @@ -2701,6 +3091,7 @@ components: - type: string - type: array - type: object + description: The rows to append to the dataset. additionalProperties: false required: - rows @@ -2749,10 +3140,13 @@ components: properties: type: type: string - const: grammar - default: grammar + enum: + - json_schema + - grammar description: >- Must be "grammar" to identify this format type + const: grammar + default: grammar bnf: type: object additionalProperties: @@ -2834,10 +3228,13 @@ components: properties: type: type: string - const: json_schema - default: json_schema + enum: + - json_schema + - grammar description: >- Must be "json_schema" to identify this format type + const: json_schema + default: json_schema json_schema: type: object additionalProperties: @@ -3199,22 +3596,34 @@ components: properties: model_id: type: string + description: >- + The identifier of the model to use. The model must be registered with + Llama Stack and available via the /models endpoint. messages_batch: type: array items: type: array items: $ref: '#/components/schemas/Message' + description: >- + The messages to generate completions for. sampling_params: $ref: '#/components/schemas/SamplingParams' + description: >- + (Optional) Parameters to control the sampling strategy. tools: type: array items: $ref: '#/components/schemas/ToolDefinition' + description: >- + (Optional) List of tool definitions available to the model. tool_config: $ref: '#/components/schemas/ToolConfig' + description: (Optional) Configuration for tool use. response_format: $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding. 
logprobs: type: object properties: @@ -3224,7 +3633,9 @@ components: description: >- How many tokens (for each position) to return log probabilities for. additionalProperties: false - title: LogProbConfig + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - model_id @@ -3297,14 +3708,22 @@ components: properties: model_id: type: string + description: >- + The identifier of the model to use. The model must be registered with + Llama Stack and available via the /models endpoint. content_batch: type: array items: $ref: '#/components/schemas/InterleavedContent' + description: The content to generate completions for. sampling_params: $ref: '#/components/schemas/SamplingParams' + description: >- + (Optional) Parameters to control the sampling strategy. response_format: $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding. logprobs: type: object properties: @@ -3314,7 +3733,9 @@ components: description: >- How many tokens (for each position) to return log probabilities for. additionalProperties: false - title: LogProbConfig + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - model_id @@ -3365,6 +3786,7 @@ components: properties: job_uuid: type: string + description: The UUID of the job to cancel. additionalProperties: false required: - job_uuid @@ -3381,17 +3803,17 @@ components: type: array items: $ref: '#/components/schemas/Message' - description: List of messages in the conversation + description: List of messages in the conversation. sampling_params: $ref: '#/components/schemas/SamplingParams' description: >- - Parameters to control the sampling strategy + Parameters to control the sampling strategy. 
tools: type: array items: $ref: '#/components/schemas/ToolDefinition' description: >- - (Optional) List of tool definitions available to the model + (Optional) List of tool definitions available to the model. tool_choice: type: string enum: @@ -3574,15 +3996,16 @@ components: Llama Stack and available via the /models endpoint. content: $ref: '#/components/schemas/InterleavedContent' - description: The content to generate a completion for + description: >- + The content to generate a completion for. sampling_params: $ref: '#/components/schemas/SamplingParams' description: >- - (Optional) Parameters to control the sampling strategy + (Optional) Parameters to control the sampling strategy. response_format: $ref: '#/components/schemas/ResponseFormat' description: >- - (Optional) Grammar specification for guided (structured) decoding + (Optional) Grammar specification for guided (structured) decoding. stream: type: boolean description: >- @@ -3896,6 +4319,13 @@ components: description: The time the step completed. step_type: type: string + enum: + - inference + - tool_execution + - shield_call + - memory_retrieval + title: StepType + description: Type of the step in an agent turn. const: inference default: inference model_response: @@ -3928,6 +4358,13 @@ components: description: The time the step completed. step_type: type: string + enum: + - inference + - tool_execution + - shield_call + - memory_retrieval + title: StepType + description: Type of the step in an agent turn. const: memory_retrieval default: memory_retrieval vector_db_ids: @@ -3989,6 +4426,13 @@ components: description: The time the step completed. step_type: type: string + enum: + - inference + - tool_execution + - shield_call + - memory_retrieval + title: StepType + description: Type of the step in an agent turn. const: shield_call default: shield_call violation: @@ -4020,6 +4464,13 @@ components: description: The time the step completed. 
step_type: type: string + enum: + - inference + - tool_execution + - shield_call + - memory_retrieval + title: StepType + description: Type of the step in an agent turn. const: tool_execution default: tool_execution tool_calls: @@ -4182,6 +4633,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: step_complete default: step_complete step_type: @@ -4220,6 +4679,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: step_progress default: step_progress step_type: @@ -4247,6 +4714,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: step_start default: step_start step_type: @@ -4291,6 +4766,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: turn_awaiting_input default: turn_awaiting_input turn: @@ -4306,6 +4789,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: turn_complete default: turn_complete turn: @@ -4320,6 +4811,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: turn_start default: turn_start turn_id: @@ -4329,23 +4828,586 @@ components: - event_type - turn_id title: AgentTurnResponseTurnStartPayload + 
OpenAIResponseInput: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' + - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' + - $ref: '#/components/schemas/OpenAIResponseMessage' + "OpenAIResponseInputFunctionToolCallOutput": + type: object + properties: + call_id: + type: string + output: + type: string + type: + type: string + const: function_call_output + default: function_call_output + id: + type: string + status: + type: string + additionalProperties: false + required: + - call_id + - output + - type + title: >- + OpenAIResponseInputFunctionToolCallOutput + description: >- + This represents the output of a function call that gets passed back to the + model. + OpenAIResponseInputMessageContent: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText' + - $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage' + discriminator: + propertyName: type + mapping: + input_text: '#/components/schemas/OpenAIResponseInputMessageContentText' + input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage' + OpenAIResponseInputMessageContentImage: + type: object + properties: + detail: + oneOf: + - type: string + const: low + - type: string + const: high + - type: string + const: auto + default: auto + type: + type: string + const: input_image + default: input_image + image_url: + type: string + additionalProperties: false + required: + - detail + - type + title: OpenAIResponseInputMessageContentImage + OpenAIResponseInputMessageContentText: + type: object + properties: + text: + type: string + type: + type: string + const: input_text + default: input_text + additionalProperties: false + required: + - text + - type + title: OpenAIResponseInputMessageContentText + OpenAIResponseInputTool: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch' + - $ref: 
'#/components/schemas/OpenAIResponseInputToolFileSearch' + - $ref: '#/components/schemas/OpenAIResponseInputToolFunction' + - $ref: '#/components/schemas/OpenAIResponseInputToolMCP' + discriminator: + propertyName: type + mapping: + web_search: '#/components/schemas/OpenAIResponseInputToolWebSearch' + file_search: '#/components/schemas/OpenAIResponseInputToolFileSearch' + function: '#/components/schemas/OpenAIResponseInputToolFunction' + mcp: '#/components/schemas/OpenAIResponseInputToolMCP' + OpenAIResponseInputToolFileSearch: + type: object + properties: + type: + type: string + const: file_search + default: file_search + vector_store_id: + type: array + items: + type: string + ranking_options: + type: object + properties: + ranker: + type: string + score_threshold: + type: number + default: 0.0 + additionalProperties: false + title: FileSearchRankingOptions + additionalProperties: false + required: + - type + - vector_store_id + title: OpenAIResponseInputToolFileSearch + OpenAIResponseInputToolFunction: + type: object + properties: + type: + type: string + const: function + default: function + name: + type: string + description: + type: string + parameters: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + strict: + type: boolean + additionalProperties: false + required: + - type + - name + title: OpenAIResponseInputToolFunction + OpenAIResponseInputToolMCP: + type: object + properties: + type: + type: string + const: mcp + default: mcp + server_label: + type: string + server_url: + type: string + headers: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + require_approval: + oneOf: + - type: string + const: always + - type: string + const: never + - type: object + properties: + always: + type: array + items: + type: string + never: + type: array + items: + type: 
string + additionalProperties: false + title: ApprovalFilter + default: never + allowed_tools: + oneOf: + - type: array + items: + type: string + - type: object + properties: + tool_names: + type: array + items: + type: string + additionalProperties: false + title: AllowedToolsFilter + additionalProperties: false + required: + - type + - server_label + - server_url + - require_approval + title: OpenAIResponseInputToolMCP + OpenAIResponseInputToolWebSearch: + type: object + properties: + type: + oneOf: + - type: string + const: web_search + - type: string + const: web_search_preview_2025_03_11 + default: web_search + search_context_size: + type: string + default: medium + additionalProperties: false + required: + - type + title: OpenAIResponseInputToolWebSearch + OpenAIResponseMessage: + type: object + properties: + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIResponseInputMessageContent' + - type: array + items: + $ref: '#/components/schemas/OpenAIResponseOutputMessageContent' + role: + oneOf: + - type: string + const: system + - type: string + const: developer + - type: string + const: user + - type: string + const: assistant + type: + type: string + const: message + default: message + id: + type: string + status: + type: string + additionalProperties: false + required: + - content + - role + - type + title: OpenAIResponseMessage + description: >- + Corresponds to the various Message types in the Responses API. They are all + under one type because the Responses API gives them all the same "type" value, + and there is no way to tell them apart in certain scenarios. 
+ OpenAIResponseOutputMessageContent: + type: object + properties: + text: + type: string + type: + type: string + const: output_text + default: output_text + additionalProperties: false + required: + - text + - type + title: >- + OpenAIResponseOutputMessageContentOutputText + "OpenAIResponseOutputMessageFunctionToolCall": + type: object + properties: + call_id: + type: string + name: + type: string + arguments: + type: string + type: + type: string + const: function_call + default: function_call + id: + type: string + status: + type: string + additionalProperties: false + required: + - call_id + - name + - arguments + - type + title: >- + OpenAIResponseOutputMessageFunctionToolCall + "OpenAIResponseOutputMessageWebSearchToolCall": + type: object + properties: + id: + type: string + status: + type: string + type: + type: string + const: web_search_call + default: web_search_call + additionalProperties: false + required: + - id + - status + - type + title: >- + OpenAIResponseOutputMessageWebSearchToolCall + CreateOpenaiResponseRequest: + type: object + properties: + input: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIResponseInput' + description: Input message(s) to create the response. + model: + type: string + description: The underlying LLM used for completions. + instructions: + type: string + previous_response_id: + type: string + description: >- + (Optional) if specified, the new response will be a continuation of the + previous response. This can be used to easily fork-off new responses from + existing responses. 
+ store: + type: boolean + stream: + type: boolean + temperature: + type: number + tools: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseInputTool' + additionalProperties: false + required: + - input + - model + title: CreateOpenaiResponseRequest + OpenAIResponseError: + type: object + properties: + code: + type: string + message: + type: string + additionalProperties: false + required: + - code + - message + title: OpenAIResponseError + OpenAIResponseObject: + type: object + properties: + created_at: + type: integer + error: + $ref: '#/components/schemas/OpenAIResponseError' + id: + type: string + model: + type: string + object: + type: string + const: response + default: response + output: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseOutput' + parallel_tool_calls: + type: boolean + default: false + previous_response_id: + type: string + status: + type: string + temperature: + type: number + top_p: + type: number + truncation: + type: string + user: + type: string + additionalProperties: false + required: + - created_at + - id + - model + - object + - output + - parallel_tool_calls + - status + title: OpenAIResponseObject + OpenAIResponseOutput: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseMessage' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' + discriminator: + propertyName: type + mapping: + message: '#/components/schemas/OpenAIResponseMessage' + web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' + mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' + mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' + 
OpenAIResponseOutputMessageMCPCall: + type: object + properties: + id: + type: string + type: + type: string + const: mcp_call + default: mcp_call + arguments: + type: string + name: + type: string + server_label: + type: string + error: + type: string + output: + type: string + additionalProperties: false + required: + - id + - type + - arguments + - name + - server_label + title: OpenAIResponseOutputMessageMCPCall + OpenAIResponseOutputMessageMCPListTools: + type: object + properties: + id: + type: string + type: + type: string + const: mcp_list_tools + default: mcp_list_tools + server_label: + type: string + tools: + type: array + items: + type: object + properties: + input_schema: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + name: + type: string + description: + type: string + additionalProperties: false + required: + - input_schema + - name + title: MCPListToolsTool + additionalProperties: false + required: + - id + - type + - server_label + - tools + title: OpenAIResponseOutputMessageMCPListTools + OpenAIResponseObjectStream: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted' + discriminator: + propertyName: type + mapping: + response.created: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' + response.output_text.delta: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta' + response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted' + "OpenAIResponseObjectStreamResponseCompleted": + type: object + properties: + response: + $ref: '#/components/schemas/OpenAIResponseObject' + type: + type: string + const: response.completed + default: response.completed + additionalProperties: false + required: + - 
response + - type + title: >- + OpenAIResponseObjectStreamResponseCompleted + "OpenAIResponseObjectStreamResponseCreated": + type: object + properties: + response: + $ref: '#/components/schemas/OpenAIResponseObject' + type: + type: string + const: response.created + default: response.created + additionalProperties: false + required: + - response + - type + title: >- + OpenAIResponseObjectStreamResponseCreated + "OpenAIResponseObjectStreamResponseOutputTextDelta": + type: object + properties: + content_index: + type: integer + delta: + type: string + item_id: + type: string + output_index: + type: integer + sequence_number: + type: integer + type: + type: string + const: response.output_text.delta + default: response.output_text.delta + additionalProperties: false + required: + - content_index + - delta + - item_id + - output_index + - sequence_number + - type + title: >- + OpenAIResponseObjectStreamResponseOutputTextDelta CreateUploadSessionRequest: type: object properties: bucket: type: string description: >- - Bucket under which the file is stored (valid chars: a-zA-Z0-9_-) + Bucket under which the file is stored (valid chars: a-zA-Z0-9_-). key: type: string description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.) + Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). mime_type: type: string - description: MIME type of the file + description: MIME type of the file. size: type: integer - description: File size in bytes + description: File size in bytes. 
additionalProperties: false required: - bucket @@ -4473,7 +5535,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/ScoringFnParamsType' const: basic default: basic aggregation_functions: @@ -4483,6 +5545,7 @@ components: additionalProperties: false required: - type + - aggregation_functions title: BasicScoringFnParams BenchmarkConfig: type: object @@ -4522,7 +5585,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/ScoringFnParamsType' const: llm_as_judge default: llm_as_judge judge_model: @@ -4541,6 +5604,8 @@ components: required: - type - judge_model + - judge_score_regexes + - aggregation_functions title: LLMAsJudgeScoringFnParams ModelCandidate: type: object @@ -4571,7 +5636,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/ScoringFnParamsType' const: regex_parser default: regex_parser parsing_regexes: @@ -4585,6 +5650,8 @@ components: additionalProperties: false required: - type + - parsing_regexes + - aggregation_functions title: RegexParserScoringFnParams ScoringFnParams: oneOf: @@ -4597,6 +5664,13 @@ components: llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' regex_parser: '#/components/schemas/RegexParserScoringFnParams' basic: '#/components/schemas/BasicScoringFnParams' + ScoringFnParamsType: + type: string + enum: + - llm_as_judge + - regex_parser + - basic + title: ScoringFnParamsType EvaluateRowsRequest: type: object properties: @@ -4759,6 +5833,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: benchmark default: benchmark dataset_id: @@ -4780,13 +5864,375 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - dataset_id - scoring_functions - metadata title: Benchmark + OpenAIAssistantMessageParam: + type: object + 
properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: The content of the model's response + name: + type: string + description: >- + (Optional) The name of the assistant message participant. + tool_calls: + type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionToolCall' + description: >- + List of tool calls. Each tool call is an OpenAIChatCompletionToolCall + object. + additionalProperties: false + required: + - role + title: OpenAIAssistantMessageParam + description: >- + A message containing the model's (assistant) response in an OpenAI-compatible + chat completion request. + "OpenAIChatCompletionContentPartImageParam": + type: object + properties: + type: + type: string + const: image_url + default: image_url + image_url: + $ref: '#/components/schemas/OpenAIImageURL' + additionalProperties: false + required: + - type + - image_url + title: >- + OpenAIChatCompletionContentPartImageParam + OpenAIChatCompletionContentPartParam: + oneOf: + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + discriminator: + propertyName: type + mapping: + text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + OpenAIChatCompletionContentPartTextParam: + type: object + properties: + type: + type: string + const: text + default: text + text: + type: string + additionalProperties: false + required: + - type + - text + title: OpenAIChatCompletionContentPartTextParam + OpenAIChatCompletionToolCall: + type: object + properties: + index: + type: integer + id: + type: string + type: + type: string + const: function + default: function + 
function: + $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction' + additionalProperties: false + required: + - type + title: OpenAIChatCompletionToolCall + OpenAIChatCompletionToolCallFunction: + type: object + properties: + name: + type: string + arguments: + type: string + additionalProperties: false + title: OpenAIChatCompletionToolCallFunction + OpenAIChoice: + type: object + properties: + message: + $ref: '#/components/schemas/OpenAIMessageParam' + description: The message from the model + finish_reason: + type: string + description: The reason the model stopped generating + index: + type: integer + description: The index of the choice + logprobs: + $ref: '#/components/schemas/OpenAIChoiceLogprobs' + description: >- + (Optional) The log probabilities for the tokens in the message + additionalProperties: false + required: + - message + - finish_reason + - index + title: OpenAIChoice + description: >- + A choice from an OpenAI-compatible chat completion response. + OpenAIChoiceLogprobs: + type: object + properties: + content: + type: array + items: + $ref: '#/components/schemas/OpenAITokenLogProb' + description: >- + (Optional) The log probabilities for the tokens in the message + refusal: + type: array + items: + $ref: '#/components/schemas/OpenAITokenLogProb' + description: >- + (Optional) The log probabilities for the tokens in the message + additionalProperties: false + title: OpenAIChoiceLogprobs + description: >- + The log probabilities for the tokens in the message from an OpenAI-compatible + chat completion response. 
+ OpenAIDeveloperMessageParam: + type: object + properties: + role: + type: string + const: developer + default: developer + description: >- + Must be "developer" to identify this as a developer message + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: The content of the developer message + name: + type: string + description: >- + (Optional) The name of the developer message participant. + additionalProperties: false + required: + - role + - content + title: OpenAIDeveloperMessageParam + description: >- + A message from the developer in an OpenAI-compatible chat completion request. + OpenAIImageURL: + type: object + properties: + url: + type: string + detail: + type: string + additionalProperties: false + required: + - url + title: OpenAIImageURL + OpenAIMessageParam: + oneOf: + - $ref: '#/components/schemas/OpenAIUserMessageParam' + - $ref: '#/components/schemas/OpenAISystemMessageParam' + - $ref: '#/components/schemas/OpenAIAssistantMessageParam' + - $ref: '#/components/schemas/OpenAIToolMessageParam' + - $ref: '#/components/schemas/OpenAIDeveloperMessageParam' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/OpenAIUserMessageParam' + system: '#/components/schemas/OpenAISystemMessageParam' + assistant: '#/components/schemas/OpenAIAssistantMessageParam' + tool: '#/components/schemas/OpenAIToolMessageParam' + developer: '#/components/schemas/OpenAIDeveloperMessageParam' + OpenAISystemMessageParam: + type: object + properties: + role: + type: string + const: system + default: system + description: >- + Must be "system" to identify this as a system message + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: >- + The content of the "system prompt". If multiple system messages are provided, + they are concatenated. 
The underlying Llama Stack code may also add other + system messages (for example, for formatting tool definitions). + name: + type: string + description: >- + (Optional) The name of the system message participant. + additionalProperties: false + required: + - role + - content + title: OpenAISystemMessageParam + description: >- + A system message providing instructions or context to the model. + OpenAITokenLogProb: + type: object + properties: + token: + type: string + bytes: + type: array + items: + type: integer + logprob: + type: number + top_logprobs: + type: array + items: + $ref: '#/components/schemas/OpenAITopLogProb' + additionalProperties: false + required: + - token + - logprob + - top_logprobs + title: OpenAITokenLogProb + description: >- + The log probability for a token from an OpenAI-compatible chat completion + response. + OpenAIToolMessageParam: + type: object + properties: + role: + type: string + const: tool + default: tool + description: >- + Must be "tool" to identify this as a tool response + tool_call_id: + type: string + description: >- + Unique identifier for the tool call this response is for + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: The response content from the tool + additionalProperties: false + required: + - role + - tool_call_id + - content + title: OpenAIToolMessageParam + description: >- + A message representing the result of a tool invocation in an OpenAI-compatible + chat completion request. + OpenAITopLogProb: + type: object + properties: + token: + type: string + bytes: + type: array + items: + type: integer + logprob: + type: number + additionalProperties: false + required: + - token + - logprob + title: OpenAITopLogProb + description: >- + The top log probability for a token from an OpenAI-compatible chat completion + response. 
+ OpenAIUserMessageParam: + type: object + properties: + role: + type: string + const: user + default: user + description: >- + Must be "user" to identify this as a user message + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: >- + The content of the message, which can include text and other media + name: + type: string + description: >- + (Optional) The name of the user message participant. + additionalProperties: false + required: + - role + - content + title: OpenAIUserMessageParam + description: >- + A message from the user in an OpenAI-compatible chat completion request. + OpenAICompletionWithInputMessages: + type: object + properties: + id: + type: string + description: The ID of the chat completion + choices: + type: array + items: + $ref: '#/components/schemas/OpenAIChoice' + description: List of choices + object: + type: string + const: chat.completion + default: chat.completion + description: >- + The object type, which will be "chat.completion" + created: + type: integer + description: >- + The Unix timestamp in seconds when the chat completion was created + model: + type: string + description: >- + The model that was used to generate the chat completion + input_messages: + type: array + items: + $ref: '#/components/schemas/OpenAIMessageParam' + additionalProperties: false + required: + - id + - choices + - object + - created + - model + - input_messages + title: OpenAICompletionWithInputMessages DataSource: oneOf: - $ref: '#/components/schemas/URIDataSource' @@ -4807,6 +6253,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: dataset default: dataset purpose: @@ -4833,7 +6289,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - purpose @@ -4932,6 +6387,16 @@ 
components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: model default: model metadata: @@ -4950,7 +6415,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - metadata @@ -5086,6 +6550,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: scoring_function default: scoring_function description: @@ -5107,7 +6581,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - metadata @@ -5146,6 +6619,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: shield default: shield params: @@ -5161,7 +6644,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type title: Shield @@ -5208,8 +6690,10 @@ components: type: array items: type: string + description: The attributes to return in the tree. max_depth: type: integer + description: The maximum depth of the tree. 
additionalProperties: false title: GetSpanTreeRequest SpanStatus: @@ -5276,12 +6760,20 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: tool default: tool toolgroup_id: type: string - tool_host: - $ref: '#/components/schemas/ToolHost' description: type: string parameters: @@ -5301,21 +6793,12 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - toolgroup_id - - tool_host - description - parameters title: Tool - ToolHost: - type: string - enum: - - distribution - - client - - model_context_protocol - title: ToolHost ToolGroup: type: object properties: @@ -5327,6 +6810,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: tool_group default: tool_group mcp_endpoint: @@ -5344,7 +6837,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type title: ToolGroup @@ -5458,6 +6950,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: vector_db default: vector_db embedding_model: @@ -5467,7 +6969,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - embedding_model @@ -5546,6 +7047,8 @@ components: properties: vector_db_id: type: string + description: >- + The identifier of the vector database to insert the chunks into. chunks: type: array items: @@ -5553,6 +7056,9 @@ components: properties: content: $ref: '#/components/schemas/InterleavedContent' + description: >- + The content of the chunk, which can be interleaved text, images, + or other types. 
metadata: type: object additionalProperties: @@ -5563,13 +7069,32 @@ components: - type: string - type: array - type: object + description: >- + Metadata associated with the chunk, such as document ID, source, + or other relevant information. + embedding: + type: array + items: + type: number + description: >- + Optional embedding for the chunk. If not provided, it will be computed + later. additionalProperties: false required: - content - metadata title: Chunk + description: >- + A chunk of content that can be inserted into a vector database. + description: >- + The chunks to insert. Each `Chunk` should contain content which can be + interleaved text, images, or other types. `metadata`: `dict[str, Any]` + and `embedding`: `List[float]` are optional. If `metadata` is provided, + you configure how Llama Stack formats the chunk during generation. If + `embedding` is not provided, it will be computed later. ttl_seconds: type: integer + description: The time to live of the chunks. additionalProperties: false required: - vector_db_id @@ -5617,6 +7142,7 @@ components: properties: tool_name: type: string + description: The name of the tool to invoke. kwargs: type: object additionalProperties: @@ -5627,6 +7153,8 @@ components: - type: string - type: array - type: object + description: >- + A dictionary of arguments to pass to the tool. 
additionalProperties: false required: - tool_name @@ -5699,28 +7227,6 @@ components: - job_id - status title: Job - ListAgentSessionsResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/Session' - additionalProperties: false - required: - - data - title: ListAgentSessionsResponse - ListAgentsResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/Agent' - additionalProperties: false - required: - - data - title: ListAgentsResponse BucketResponse: type: object properties: @@ -5755,6 +7261,73 @@ components: required: - data title: ListBenchmarksResponse + Order: + type: string + enum: + - asc + - desc + title: Order + ListOpenAIChatCompletionResponse: + type: object + properties: + data: + type: array + items: + type: object + properties: + id: + type: string + description: The ID of the chat completion + choices: + type: array + items: + $ref: '#/components/schemas/OpenAIChoice' + description: List of choices + object: + type: string + const: chat.completion + default: chat.completion + description: >- + The object type, which will be "chat.completion" + created: + type: integer + description: >- + The Unix timestamp in seconds when the chat completion was created + model: + type: string + description: >- + The model that was used to generate the chat completion + input_messages: + type: array + items: + $ref: '#/components/schemas/OpenAIMessageParam' + additionalProperties: false + required: + - id + - choices + - object + - created + - model + - input_messages + title: OpenAICompletionWithInputMessages + has_more: + type: boolean + first_id: + type: string + last_id: + type: string + object: + type: string + const: list + default: list + additionalProperties: false + required: + - data + - has_more + - first_id + - last_id + - object + title: ListOpenAIChatCompletionResponse ListDatasetsResponse: type: object properties: @@ -5791,6 +7364,96 @@ components: required: - data 
title: ListModelsResponse + ListOpenAIResponseInputItem: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseInput' + object: + type: string + const: list + default: list + additionalProperties: false + required: + - data + - object + title: ListOpenAIResponseInputItem + ListOpenAIResponseObject: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseObjectWithInput' + has_more: + type: boolean + first_id: + type: string + last_id: + type: string + object: + type: string + const: list + default: list + additionalProperties: false + required: + - data + - has_more + - first_id + - last_id + - object + title: ListOpenAIResponseObject + OpenAIResponseObjectWithInput: + type: object + properties: + created_at: + type: integer + error: + $ref: '#/components/schemas/OpenAIResponseError' + id: + type: string + model: + type: string + object: + type: string + const: response + default: response + output: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseOutput' + parallel_tool_calls: + type: boolean + default: false + previous_response_id: + type: string + status: + type: string + temperature: + type: number + top_p: + type: number + truncation: + type: string + user: + type: string + input: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseInput' + additionalProperties: false + required: + - created_at + - id + - model + - object + - output + - parallel_tool_calls + - status + - input + title: OpenAIResponseObjectWithInput ListProvidersResponse: type: object properties: @@ -5907,6 +7570,13 @@ components: unstructured_log: '#/components/schemas/UnstructuredLogEvent' metric: '#/components/schemas/MetricEvent' structured_log: '#/components/schemas/StructuredLogEvent' + EventType: + type: string + enum: + - unstructured_log + - structured_log + - metric + title: EventType LogSeverity: type: string enum: @@ -5937,7 +7607,7 @@ components: - 
type: boolean - type: 'null' type: - type: string + $ref: '#/components/schemas/EventType' const: metric default: metric metric: @@ -5962,7 +7632,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/StructuredLogType' const: span_end default: span_end status: @@ -5976,7 +7646,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/StructuredLogType' const: span_start default: span_start name: @@ -6008,7 +7678,7 @@ components: - type: boolean - type: 'null' type: - type: string + $ref: '#/components/schemas/EventType' const: structured_log default: structured_log payload: @@ -6030,6 +7700,12 @@ components: mapping: span_start: '#/components/schemas/SpanStartPayload' span_end: '#/components/schemas/SpanEndPayload' + StructuredLogType: + type: string + enum: + - span_start + - span_end + title: StructuredLogType UnstructuredLogEvent: type: object properties: @@ -6050,7 +7726,7 @@ components: - type: boolean - type: 'null' type: - type: string + $ref: '#/components/schemas/EventType' const: unstructured_log default: unstructured_log message: @@ -6071,149 +7747,15 @@ components: properties: event: $ref: '#/components/schemas/Event' + description: The event to log. ttl_seconds: type: integer + description: The time to live of the event. additionalProperties: false required: - event - ttl_seconds title: LogEventRequest - OpenAIAssistantMessageParam: - type: object - properties: - role: - type: string - const: assistant - default: assistant - description: >- - Must be "assistant" to identify this as the model's response - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: The content of the model's response - name: - type: string - description: >- - (Optional) The name of the assistant message participant. 
- tool_calls: - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionToolCall' - description: >- - List of tool calls. Each tool call is an OpenAIChatCompletionToolCall - object. - additionalProperties: false - required: - - role - title: OpenAIAssistantMessageParam - description: >- - A message containing the model's (assistant) response in an OpenAI-compatible - chat completion request. - "OpenAIChatCompletionContentPartImageParam": - type: object - properties: - type: - type: string - const: image_url - default: image_url - image_url: - $ref: '#/components/schemas/OpenAIImageURL' - additionalProperties: false - required: - - type - - image_url - title: >- - OpenAIChatCompletionContentPartImageParam - OpenAIChatCompletionContentPartParam: - oneOf: - - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' - - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' - discriminator: - propertyName: type - mapping: - text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' - image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' - OpenAIChatCompletionContentPartTextParam: - type: object - properties: - type: - type: string - const: text - default: text - text: - type: string - additionalProperties: false - required: - - type - - text - title: OpenAIChatCompletionContentPartTextParam - OpenAIChatCompletionToolCall: - type: object - properties: - index: - type: integer - id: - type: string - type: - type: string - const: function - default: function - function: - $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction' - additionalProperties: false - required: - - type - title: OpenAIChatCompletionToolCall - OpenAIChatCompletionToolCallFunction: - type: object - properties: - name: - type: string - arguments: - type: string - additionalProperties: false - title: OpenAIChatCompletionToolCallFunction - OpenAIDeveloperMessageParam: - type: object - properties: - role: - type: 
string - const: developer - default: developer - description: >- - Must be "developer" to identify this as a developer message - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: The content of the developer message - name: - type: string - description: >- - (Optional) The name of the developer message participant. - additionalProperties: false - required: - - role - - content - title: OpenAIDeveloperMessageParam - description: >- - A message from the developer in an OpenAI-compatible chat completion request. - OpenAIImageURL: - type: object - properties: - url: - type: string - detail: - type: string - additionalProperties: false - required: - - url - title: OpenAIImageURL OpenAIJSONSchema: type: object properties: @@ -6237,21 +7779,6 @@ components: required: - name title: OpenAIJSONSchema - OpenAIMessageParam: - oneOf: - - $ref: '#/components/schemas/OpenAIUserMessageParam' - - $ref: '#/components/schemas/OpenAISystemMessageParam' - - $ref: '#/components/schemas/OpenAIAssistantMessageParam' - - $ref: '#/components/schemas/OpenAIToolMessageParam' - - $ref: '#/components/schemas/OpenAIDeveloperMessageParam' - discriminator: - propertyName: role - mapping: - user: '#/components/schemas/OpenAIUserMessageParam' - system: '#/components/schemas/OpenAISystemMessageParam' - assistant: '#/components/schemas/OpenAIAssistantMessageParam' - tool: '#/components/schemas/OpenAIToolMessageParam' - developer: '#/components/schemas/OpenAIDeveloperMessageParam' OpenAIResponseFormatJSONObject: type: object properties: @@ -6299,93 +7826,6 @@ components: required: - type title: OpenAIResponseFormatText - OpenAISystemMessageParam: - type: object - properties: - role: - type: string - const: system - default: system - description: >- - Must be "system" to identify this as a system message - content: - oneOf: - - type: string - - type: array - items: - $ref: 
'#/components/schemas/OpenAIChatCompletionContentPartParam' - description: >- - The content of the "system prompt". If multiple system messages are provided, - they are concatenated. The underlying Llama Stack code may also add other - system messages (for example, for formatting tool definitions). - name: - type: string - description: >- - (Optional) The name of the system message participant. - additionalProperties: false - required: - - role - - content - title: OpenAISystemMessageParam - description: >- - A system message providing instructions or context to the model. - OpenAIToolMessageParam: - type: object - properties: - role: - type: string - const: tool - default: tool - description: >- - Must be "tool" to identify this as a tool response - tool_call_id: - type: string - description: >- - Unique identifier for the tool call this response is for - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: The response content from the tool - additionalProperties: false - required: - - role - - tool_call_id - - content - title: OpenAIToolMessageParam - description: >- - A message representing the result of a tool invocation in an OpenAI-compatible - chat completion request. - OpenAIUserMessageParam: - type: object - properties: - role: - type: string - const: user - default: user - description: >- - Must be "user" to identify this as a user message - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: >- - The content of the message, which can include text and other media - name: - type: string - description: >- - (Optional) The name of the user message participant. - additionalProperties: false - required: - - role - - content - title: OpenAIUserMessageParam - description: >- - A message from the user in an OpenAI-compatible chat completion request. 
OpenaiChatCompletionRequest: type: object properties: @@ -6398,11 +7838,11 @@ components: type: array items: $ref: '#/components/schemas/OpenAIMessageParam' - description: List of messages in the conversation + description: List of messages in the conversation. frequency_penalty: type: number description: >- - (Optional) The penalty for repeated tokens + (Optional) The penalty for repeated tokens. function_call: oneOf: - type: string @@ -6415,7 +7855,7 @@ components: - type: string - type: array - type: object - description: (Optional) The function call to use + description: (Optional) The function call to use. functions: type: array items: @@ -6428,52 +7868,52 @@ components: - type: string - type: array - type: object - description: (Optional) List of functions to use + description: (Optional) List of functions to use. logit_bias: type: object additionalProperties: type: number - description: (Optional) The logit bias to use + description: (Optional) The logit bias to use. logprobs: type: boolean - description: (Optional) The log probabilities to use + description: (Optional) The log probabilities to use. max_completion_tokens: type: integer description: >- - (Optional) The maximum number of tokens to generate + (Optional) The maximum number of tokens to generate. max_tokens: type: integer description: >- - (Optional) The maximum number of tokens to generate + (Optional) The maximum number of tokens to generate. n: type: integer description: >- - (Optional) The number of completions to generate + (Optional) The number of completions to generate. parallel_tool_calls: type: boolean description: >- - (Optional) Whether to parallelize tool calls + (Optional) Whether to parallelize tool calls. presence_penalty: type: number description: >- - (Optional) The penalty for repeated tokens + (Optional) The penalty for repeated tokens. 
response_format: $ref: '#/components/schemas/OpenAIResponseFormatParam' - description: (Optional) The response format to use + description: (Optional) The response format to use. seed: type: integer - description: (Optional) The seed to use + description: (Optional) The seed to use. stop: oneOf: - type: string - type: array items: type: string - description: (Optional) The stop tokens to use + description: (Optional) The stop tokens to use. stream: type: boolean description: >- - (Optional) Whether to stream the response + (Optional) Whether to stream the response. stream_options: type: object additionalProperties: @@ -6484,10 +7924,10 @@ components: - type: string - type: array - type: object - description: (Optional) The stream options to use + description: (Optional) The stream options to use. temperature: type: number - description: (Optional) The temperature to use + description: (Optional) The temperature to use. tool_choice: oneOf: - type: string @@ -6500,7 +7940,7 @@ components: - type: string - type: array - type: object - description: (Optional) The tool choice to use + description: (Optional) The tool choice to use. tools: type: array items: @@ -6513,17 +7953,17 @@ components: - type: string - type: array - type: object - description: (Optional) The tools to use + description: (Optional) The tools to use. top_logprobs: type: integer description: >- - (Optional) The top log probabilities to use + (Optional) The top log probabilities to use. top_p: type: number - description: (Optional) The top p to use + description: (Optional) The top p to use. user: type: string - description: (Optional) The user to use + description: (Optional) The user to use. additionalProperties: false required: - model @@ -6599,30 +8039,6 @@ components: title: OpenAIChatCompletionChunk description: >- Chunk from a streaming response to an OpenAI-compatible chat completion request. 
- OpenAIChoice: - type: object - properties: - message: - $ref: '#/components/schemas/OpenAIMessageParam' - description: The message from the model - finish_reason: - type: string - description: The reason the model stopped generating - index: - type: integer - description: The index of the choice - logprobs: - $ref: '#/components/schemas/OpenAIChoiceLogprobs' - description: >- - (Optional) The log probabilities for the tokens in the message - additionalProperties: false - required: - - message - - finish_reason - - index - title: OpenAIChoice - description: >- - A choice from an OpenAI-compatible chat completion response. OpenAIChoiceDelta: type: object properties: @@ -6644,26 +8060,6 @@ components: title: OpenAIChoiceDelta description: >- A delta from an OpenAI-compatible chat completion streaming response. - OpenAIChoiceLogprobs: - type: object - properties: - content: - type: array - items: - $ref: '#/components/schemas/OpenAITokenLogProb' - description: >- - (Optional) The log probabilities for the tokens in the message - refusal: - type: array - items: - $ref: '#/components/schemas/OpenAITokenLogProb' - description: >- - (Optional) The log probabilities for the tokens in the message - additionalProperties: false - title: OpenAIChoiceLogprobs - description: >- - The log probabilities for the tokens in the message from an OpenAI-compatible - chat completion response. OpenAIChunkChoice: type: object properties: @@ -6688,49 +8084,6 @@ components: title: OpenAIChunkChoice description: >- A chunk choice from an OpenAI-compatible chat completion streaming response. 
- OpenAITokenLogProb: - type: object - properties: - token: - type: string - bytes: - type: array - items: - type: integer - logprob: - type: number - top_logprobs: - type: array - items: - $ref: '#/components/schemas/OpenAITopLogProb' - additionalProperties: false - required: - - token - - logprob - - top_logprobs - title: OpenAITokenLogProb - description: >- - The log probability for a token from an OpenAI-compatible chat completion - response. - OpenAITopLogProb: - type: object - properties: - token: - type: string - bytes: - type: array - items: - type: integer - logprob: - type: number - additionalProperties: false - required: - - token - - logprob - title: OpenAITopLogProb - description: >- - The top log probability for a token from an OpenAI-compatible chat completion - response. OpenaiCompletionRequest: type: object properties: @@ -6753,52 +8106,52 @@ components: type: array items: type: integer - description: The prompt to generate a completion for + description: The prompt to generate a completion for. best_of: type: integer description: >- - (Optional) The number of completions to generate + (Optional) The number of completions to generate. echo: type: boolean - description: (Optional) Whether to echo the prompt + description: (Optional) Whether to echo the prompt. frequency_penalty: type: number description: >- - (Optional) The penalty for repeated tokens + (Optional) The penalty for repeated tokens. logit_bias: type: object additionalProperties: type: number - description: (Optional) The logit bias to use + description: (Optional) The logit bias to use. logprobs: type: boolean - description: (Optional) The log probabilities to use + description: (Optional) The log probabilities to use. max_tokens: type: integer description: >- - (Optional) The maximum number of tokens to generate + (Optional) The maximum number of tokens to generate. 
n: type: integer description: >- - (Optional) The number of completions to generate + (Optional) The number of completions to generate. presence_penalty: type: number description: >- - (Optional) The penalty for repeated tokens + (Optional) The penalty for repeated tokens. seed: type: integer - description: (Optional) The seed to use + description: (Optional) The seed to use. stop: oneOf: - type: string - type: array items: type: string - description: (Optional) The stop tokens to use + description: (Optional) The stop tokens to use. stream: type: boolean description: >- - (Optional) Whether to stream the response + (Optional) Whether to stream the response. stream_options: type: object additionalProperties: @@ -6809,16 +8162,16 @@ components: - type: string - type: array - type: object - description: (Optional) The stream options to use + description: (Optional) The stream options to use. temperature: type: number - description: (Optional) The temperature to use + description: (Optional) The temperature to use. top_p: type: number - description: (Optional) The top p to use + description: (Optional) The top p to use. user: type: string - description: (Optional) The user to use + description: (Optional) The user to use. guided_choice: type: array items: @@ -6876,6 +8229,118 @@ components: title: OpenAICompletionChoice description: >- A choice from an OpenAI-compatible completion response. + OpenaiEmbeddingsRequest: + type: object + properties: + model: + type: string + description: >- + The identifier of the model to use. The model must be an embedding model + registered with Llama Stack and available via the /models endpoint. + input: + oneOf: + - type: string + - type: array + items: + type: string + description: >- + Input text to embed, encoded as a string or array of strings. To embed + multiple inputs in a single request, pass an array of strings. + encoding_format: + type: string + description: >- + (Optional) The format to return the embeddings in. 
Can be either "float" + or "base64". Defaults to "float". + dimensions: + type: integer + description: >- + (Optional) The number of dimensions the resulting output embeddings should + have. Only supported in text-embedding-3 and later models. + user: + type: string + description: >- + (Optional) A unique identifier representing your end-user, which can help + OpenAI to monitor and detect abuse. + additionalProperties: false + required: + - model + - input + title: OpenaiEmbeddingsRequest + OpenAIEmbeddingData: + type: object + properties: + object: + type: string + const: embedding + default: embedding + description: >- + The object type, which will be "embedding" + embedding: + oneOf: + - type: array + items: + type: number + - type: string + description: >- + The embedding vector as a list of floats (when encoding_format="float") + or as a base64-encoded string (when encoding_format="base64") + index: + type: integer + description: >- + The index of the embedding in the input list + additionalProperties: false + required: + - object + - embedding + - index + title: OpenAIEmbeddingData + description: >- + A single embedding data object from an OpenAI-compatible embeddings response. + OpenAIEmbeddingUsage: + type: object + properties: + prompt_tokens: + type: integer + description: The number of tokens in the input + total_tokens: + type: integer + description: The total number of tokens used + additionalProperties: false + required: + - prompt_tokens + - total_tokens + title: OpenAIEmbeddingUsage + description: >- + Usage information for an OpenAI-compatible embeddings response. 
+ OpenAIEmbeddingsResponse: + type: object + properties: + object: + type: string + const: list + default: list + description: The object type, which will be "list" + data: + type: array + items: + $ref: '#/components/schemas/OpenAIEmbeddingData' + description: List of embedding data objects + model: + type: string + description: >- + The model that was used to generate the embeddings + usage: + $ref: '#/components/schemas/OpenAIEmbeddingUsage' + description: Usage information + additionalProperties: false + required: + - object + - data + - model + - usage + title: OpenAIEmbeddingsResponse + description: >- + Response from an OpenAI-compatible embeddings request. OpenAIModel: type: object properties: @@ -7034,12 +8499,16 @@ components: properties: job_uuid: type: string + description: The UUID of the job to create. finetuned_model: type: string + description: The model to fine-tune. algorithm_config: $ref: '#/components/schemas/DPOAlignmentConfig' + description: The algorithm configuration. training_config: $ref: '#/components/schemas/TrainingConfig' + description: The training configuration. hyperparam_search_config: type: object additionalProperties: @@ -7050,6 +8519,7 @@ components: - type: string - type: array - type: object + description: The hyperparam search configuration. logger_config: type: object additionalProperties: @@ -7060,6 +8530,7 @@ components: - type: string - type: array - type: object + description: The logger configuration. additionalProperties: false required: - job_uuid @@ -7115,18 +8586,41 @@ components: properties: query_generator_config: $ref: '#/components/schemas/RAGQueryGeneratorConfig' + description: Configuration for the query generator. max_tokens_in_context: type: integer default: 4096 + description: Maximum number of tokens in the context. max_chunks: type: integer default: 5 + description: Maximum number of chunks to retrieve. 
+ chunk_template: + type: string + default: > + Result {index} + + Content: {chunk.content} + + Metadata: {metadata} + description: >- + Template for formatting each retrieved chunk in the context. Available + placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk + content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent: + {chunk.content}\nMetadata: {metadata}\n" + mode: + type: string + description: >- + Search mode for retrieval—either "vector" or "keyword". Default "vector". additionalProperties: false required: - query_generator_config - max_tokens_in_context - max_chunks + - chunk_template title: RAGQueryConfig + description: >- + Configuration for the RAG query generation. RAGQueryGeneratorConfig: oneOf: - $ref: '#/components/schemas/DefaultRAGQueryGeneratorConfig' @@ -7176,8 +8670,11 @@ components: properties: vector_db_id: type: string + description: >- + The identifier of the vector database to query. query: $ref: '#/components/schemas/InterleavedContent' + description: The query to search for. params: type: object additionalProperties: @@ -7188,6 +8685,7 @@ components: - type: string - type: array - type: object + description: The parameters of the query. additionalProperties: false required: - vector_db_id @@ -7203,6 +8701,9 @@ components: properties: content: $ref: '#/components/schemas/InterleavedContent' + description: >- + The content of the chunk, which can be interleaved text, images, + or other types. metadata: type: object additionalProperties: @@ -7213,11 +8714,23 @@ components: - type: string - type: array - type: object + description: >- + Metadata associated with the chunk, such as document ID, source, + or other relevant information. + embedding: + type: array + items: + type: number + description: >- + Optional embedding for the chunk. If not provided, it will be computed + later. 
additionalProperties: false required: - content - metadata title: Chunk + description: >- + A chunk of content that can be inserted into a vector database. scores: type: array items: @@ -7227,6 +8740,109 @@ components: - chunks - scores title: QueryChunksResponse + QueryMetricsRequest: + type: object + properties: + start_time: + type: integer + description: The start time of the metric to query. + end_time: + type: integer + description: The end time of the metric to query. + granularity: + type: string + description: The granularity of the metric to query. + query_type: + type: string + enum: + - range + - instant + description: The type of query to perform. + label_matchers: + type: array + items: + type: object + properties: + name: + type: string + value: + type: string + operator: + type: string + enum: + - '=' + - '!=' + - =~ + - '!~' + title: MetricLabelOperator + default: '=' + additionalProperties: false + required: + - name + - value + - operator + title: MetricLabelMatcher + description: >- + The label matchers to apply to the metric. 
+ additionalProperties: false + required: + - start_time + - query_type + title: QueryMetricsRequest + MetricDataPoint: + type: object + properties: + timestamp: + type: integer + value: + type: number + additionalProperties: false + required: + - timestamp + - value + title: MetricDataPoint + MetricLabel: + type: object + properties: + name: + type: string + value: + type: string + additionalProperties: false + required: + - name + - value + title: MetricLabel + MetricSeries: + type: object + properties: + metric: + type: string + labels: + type: array + items: + $ref: '#/components/schemas/MetricLabel' + values: + type: array + items: + $ref: '#/components/schemas/MetricDataPoint' + additionalProperties: false + required: + - metric + - labels + - values + title: MetricSeries + QueryMetricsResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/MetricSeries' + additionalProperties: false + required: + - data + title: QueryMetricsResponse QueryCondition: type: object properties: @@ -7263,12 +8879,16 @@ components: type: array items: $ref: '#/components/schemas/QueryCondition' + description: >- + The attribute filters to apply to the spans. attributes_to_return: type: array items: type: string + description: The attributes to return in the spans. max_depth: type: integer + description: The maximum depth of the tree. additionalProperties: false required: - attribute_filters @@ -7292,14 +8912,19 @@ components: type: array items: $ref: '#/components/schemas/QueryCondition' + description: >- + The attribute filters to apply to the traces. limit: type: integer + description: The limit of traces to return. offset: type: integer + description: The offset of the traces to return. order_by: type: array items: type: string + description: The order by of the traces to return. 
additionalProperties: false title: QueryTracesRequest QueryTracesResponse: @@ -7318,16 +8943,25 @@ components: properties: benchmark_id: type: string + description: The ID of the benchmark to register. dataset_id: type: string + description: >- + The ID of the dataset to use for the benchmark. scoring_functions: type: array items: type: string + description: >- + The scoring functions to use for the benchmark. provider_benchmark_id: type: string + description: >- + The ID of the provider benchmark to use for the benchmark. provider_id: type: string + description: >- + The ID of the provider to use for the benchmark. metadata: type: object additionalProperties: @@ -7338,6 +8972,7 @@ components: - type: string - type: array - type: object + description: The metadata to use for the benchmark. additionalProperties: false required: - benchmark_id @@ -7354,7 +8989,7 @@ components: - eval/question-answer - eval/messages-answer description: >- - The purpose of the dataset. One of - "post-training/messages": The dataset + The purpose of the dataset. One of: - "post-training/messages": The dataset contains a messages column with list of messages for post-training. { "messages": [ {"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ] } - "eval/question-answer": The dataset @@ -7387,7 +9022,7 @@ components: - type: array - type: object description: >- - The metadata for the dataset. - E.g. {"description": "My dataset"} + The metadata for the dataset. - E.g. {"description": "My dataset"}. dataset_id: type: string description: >- @@ -7402,10 +9037,14 @@ components: properties: model_id: type: string + description: The identifier of the model to register. provider_model_id: type: string + description: >- + The identifier of the model in the provider. provider_id: type: string + description: The identifier of the provider. 
metadata: type: object additionalProperties: @@ -7416,8 +9055,10 @@ components: - type: string - type: array - type: object + description: Any additional metadata for this model. model_type: $ref: '#/components/schemas/ModelType' + description: The type of model to register. additionalProperties: false required: - model_id @@ -7427,16 +9068,27 @@ components: properties: scoring_fn_id: type: string + description: >- + The ID of the scoring function to register. description: type: string + description: The description of the scoring function. return_type: $ref: '#/components/schemas/ParamType' + description: The return type of the scoring function. provider_scoring_fn_id: type: string + description: >- + The ID of the provider scoring function to use for the scoring function. provider_id: type: string + description: >- + The ID of the provider to use for the scoring function. params: $ref: '#/components/schemas/ScoringFnParams' + description: >- + The parameters for the scoring function for benchmark eval, these can + be overridden for app eval. additionalProperties: false required: - scoring_fn_id @@ -7448,10 +9100,15 @@ components: properties: shield_id: type: string + description: >- + The identifier of the shield to register. provider_shield_id: type: string + description: >- + The identifier of the shield in the provider. provider_id: type: string + description: The identifier of the provider. params: type: object additionalProperties: @@ -7462,6 +9119,7 @@ components: - type: string - type: array - type: object + description: The parameters of the shield. additionalProperties: false required: - shield_id @@ -7471,10 +9129,15 @@ components: properties: toolgroup_id: type: string + description: The ID of the tool group to register. provider_id: type: string + description: >- + The ID of the provider to use for the tool group. mcp_endpoint: $ref: '#/components/schemas/URL' + description: >- + The MCP endpoint to use for the tool group. 
args: type: object additionalProperties: @@ -7485,6 +9148,8 @@ components: - type: string - type: array - type: object + description: >- + A dictionary of arguments to pass to the tool group. additionalProperties: false required: - toolgroup_id @@ -7495,14 +9160,21 @@ components: properties: vector_db_id: type: string + description: >- + The identifier of the vector database to register. embedding_model: type: string + description: The embedding model to use. embedding_dimension: type: integer + description: The dimension of the embedding model. provider_id: type: string + description: The identifier of the provider. provider_vector_db_id: type: string + description: >- + The identifier of the vector database in the provider. additionalProperties: false required: - vector_db_id @@ -7539,10 +9211,12 @@ components: properties: shield_id: type: string + description: The identifier of the shield to run. messages: type: array items: $ref: '#/components/schemas/Message' + description: The messages to run the shield on. params: type: object additionalProperties: @@ -7553,6 +9227,7 @@ components: - type: string - type: array - type: object + description: The parameters of the shield. additionalProperties: false required: - shield_id @@ -7573,14 +9248,20 @@ components: type: array items: $ref: '#/components/schemas/QueryCondition' + description: >- + The attribute filters to apply to the spans. attributes_to_save: type: array items: type: string + description: The attributes to save to the dataset. dataset_id: type: string + description: >- + The ID of the dataset to save the spans to. max_depth: type: integer + description: The maximum depth of the tree. additionalProperties: false required: - attribute_filters @@ -7635,14 +9316,19 @@ components: properties: dataset_id: type: string + description: The ID of the dataset to score. 
scoring_functions: type: object additionalProperties: oneOf: - $ref: '#/components/schemas/ScoringFnParams' - type: 'null' + description: >- + The scoring functions to use for the scoring. save_results_dataset: type: boolean + description: >- + Whether to save the results to a dataset. additionalProperties: false required: - dataset_id @@ -7727,8 +9413,10 @@ components: properties: job_uuid: type: string + description: The UUID of the job to create. training_config: $ref: '#/components/schemas/TrainingConfig' + description: The training configuration. hyperparam_search_config: type: object additionalProperties: @@ -7739,6 +9427,7 @@ components: - type: string - type: array - type: object + description: The hyperparam search configuration. logger_config: type: object additionalProperties: @@ -7749,12 +9438,16 @@ components: - type: string - type: array - type: object + description: The logger configuration. model: type: string + description: The model to fine-tune. checkpoint_dir: type: string + description: The directory to save checkpoint(s) to. algorithm_config: $ref: '#/components/schemas/AlgorithmConfig' + description: The algorithm configuration. additionalProperties: false required: - job_uuid diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index b764d4d34..cdaf074b8 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -1050,8 +1050,6 @@ "text/html": [ "
ToolGroup(\n",
               "identifier='builtin::code_interpreter',\n",
-              "provider_id='code-interpreter',\n",
-              "provider_resource_id='builtin::code_interpreter',\n",
               "type='tool_group',\n",
               "args=None,\n",
               "mcp_endpoint=None\n",
@@ -1061,7 +1059,6 @@
             "text/plain": [
               "\u001b[1;35mToolGroup\u001b[0m\u001b[1m(\u001b[0m\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33midentifier\u001b[0m=\u001b[32m'builtin::code_interpreter'\u001b[0m,\n",
-              "\u001b[2;32m│   \u001b[0m\u001b[33mprovider_id\u001b[0m=\u001b[32m'code-interpreter'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33mprovider_resource_id\u001b[0m=\u001b[32m'builtin::code_interpreter'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33mtype\u001b[0m=\u001b[32m'tool_group'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33margs\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
diff --git a/docs/getting_started_llama_api.ipynb b/docs/getting_started_llama_api.ipynb
new file mode 100644
index 000000000..128e9114a
--- /dev/null
+++ b/docs/getting_started_llama_api.ipynb
@@ -0,0 +1,907 @@
+{
+    "cells": [
+      {
+        "cell_type": "markdown",
+        "id": "c1e7571c",
+        "metadata": {
+          "id": "c1e7571c"
+        },
+        "source": [
+          "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started_llama_api.ipynb)\n",
+          "\n",
+          "# Getting Started with Llama 4 in Llama Stack\n",
+          "\n",
+          "\"drawing\"\n",
+          "\n",
+          "[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
+          "\n",
+          "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
+          "\n",
+          "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "4CV1Q19BDMVw",
+        "metadata": {
+          "id": "4CV1Q19BDMVw"
+        },
+        "source": [
+          "## 1. Getting started with Llama Stack"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "K4AvfUAJZOeS",
+        "metadata": {
+          "id": "K4AvfUAJZOeS"
+        },
+        "source": [
+          "### 1.1. Create Llama API account\n",
+          "\n",
+          "In this showcase, we will use [Llama API](https://llama.developer.meta.com/) as the inference provider. You will first need to get an API key from Llama API if you don't have one already.\n",
+          "\n",
+          "\n",
+          "\n",
+          "> **Note:** Set the API key as `LLAMA_API_KEY` in the Secrets of this notebook.\n",
+          "\n"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "oDUB7M_qe-Gs",
+        "metadata": {
+          "id": "oDUB7M_qe-Gs"
+        },
+        "source": [
+          "### 1.2. Setup and Running a Llama Stack server\n",
+          "\n",
+          "Llama Stack is architected as a collection of APIs that provide developers with the building blocks to build AI applications. \n",
+          "\n",
+          "Llama Stack is typically available as a server with an endpoint that you can make calls to. Partners like Together and Fireworks offer their own Llama Stack compatible endpoints.\n",
+          "\n",
+          "In this showcase, we will start a Llama Stack server that is running locally.\n"
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": null,
+        "id": "J2kGed0R5PSf",
+        "metadata": {
+          "colab": {
+            "base_uri": "https://localhost:8080/"
+          },
+          "collapsed": true,
+          "id": "J2kGed0R5PSf",
+          "outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
+        },
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "Requirement already satisfied: uv in /opt/homebrew/Caskroom/miniconda/base/envs/l4/lib/python3.10/site-packages (0.6.12)\n",
+              "\u001b[2mUsing Python 3.10.16 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/l4\u001b[0m\n",
+              "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 83ms\u001b[0m\u001b[0m\n",
+              "Environment '/Users/erichuang/projects/internal-llama-stack/.venv' already exists, re-using it.\n",
+              "Virtual environment /Users/erichuang/projects/internal-llama-stack/.venv is already active\n",
+              "\u001b[2mUsing Python 3.11.11 environment at: /Users/erichuang/projects/internal-llama-stack/.venv\u001b[0m\n",
+              "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 387ms\u001b[0m\u001b[0m\n",
+              "Installing pip dependencies\n",
+              "\u001b[2mUsing Python 3.11.11 environment at: /Users/erichuang/projects/internal-llama-stack/.venv\u001b[0m\n",
+              "\u001b[2K\u001b[2mResolved \u001b[1m123 packages\u001b[0m \u001b[2min 1.13s\u001b[0m\u001b[0m                                       \u001b[0m\n",
+              "\u001b[2K\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)                                                   \n",
+              "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)-----\u001b[0m\u001b[0m     0 B/9.53 KiB                     \u001b[1A\n",
+              "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)-\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB                    \u001b[1A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2K\u001b[2A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m     0 B/44.00 KiB                     \u001b[2A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2K\u001b[2A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB                   \u001b[2A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m\u001b[2m------------------------------\u001b[0m\u001b[0m     0 B/34.43 KiB\n",
+              "\u001b[2K\u001b[3A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB                   \u001b[3A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n",
+              "\u001b[2K\u001b[3A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB                   \u001b[3A\n",
+              "\u001b[2meval-type-backport\u001b[0m \u001b[32m\u001b[2m------------------------------\u001b[0m\u001b[0m     0 B/5.69 KiB\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n",
+              "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB                   \u001b[4A\n",
+              "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n",
+              "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB                   \u001b[4A\n",
+              "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2K\u001b[5A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m     0 B/85.81 KiB                     \u001b[5A\n",
+              "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2K\u001b[5A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB                   \u001b[5A\n",
+              "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[6A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m     0 B/3.08 MiB                      \u001b[6A\n",
+              "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[6A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.91 KiB/3.08 MiB                    \u001b[6A\n",
+              "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m---------------------------\u001b[2m---\u001b[0m\u001b[0m 30.83 KiB/34.43 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[6A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.91 KiB/3.08 MiB                    \u001b[6A\n",
+              "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 34.43 KiB/34.43 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[6A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.91 KiB/3.08 MiB                    \u001b[6A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 34.43 KiB/34.43 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[5A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.91 KiB/3.08 MiB                    \u001b[5A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtabulate  \u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 34.43 KiB/34.43 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[5A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 30.91 KiB/3.08 MiB                    \u001b[5A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 30.91 KiB/3.08 MiB                    \u001b[4A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 46.91 KiB/3.08 MiB                    \u001b[4A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 62.91 KiB/3.08 MiB                    \u001b[4A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 78.91 KiB/3.08 MiB                    \u001b[4A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 94.91 KiB/3.08 MiB                    \u001b[4A\n",
+              "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------------\u001b[2m------------------\u001b[0m\u001b[0m 32.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 2.62 MiB/3.08 MiB                     \u001b[4A\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m----------------------\u001b[2m--------\u001b[0m\u001b[0m 30.88 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------------\u001b[2m------------------\u001b[0m\u001b[0m 32.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[3A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)----\u001b[0m\u001b[0m 2.62 MiB/3.08 MiB                     \u001b[3A\n",
+              "\u001b[2mtyper     \u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 44.00 KiB/44.00 KiB\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------------\u001b[2m------------------\u001b[0m\u001b[0m 32.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[3A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)----\u001b[0m\u001b[0m 2.62 MiB/3.08 MiB                     \u001b[3A\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m------------\u001b[2m------------------\u001b[0m\u001b[0m 32.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[2A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)2m--\u001b[0m\u001b[0m 2.80 MiB/3.08 MiB                     \u001b[2A\n",
+              "\u001b[2mtogether  \u001b[0m \u001b[32m-----------------\u001b[2m-------------\u001b[0m\u001b[0m 48.00 KiB/85.81 KiB\n",
+              "\u001b[2K\u001b[2A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)2m--\u001b[0m\u001b[0m 2.81 MiB/3.08 MiB                     \u001b[2A\n",
+              "\u001b[2K\u001b[1A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)----\u001b[0m\u001b[0m 48.00 KiB/85.81 KiB                   \u001b[1A\n",
+              "\u001b[2K\u001b[1A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)2m--\u001b[0m\u001b[0m 80.00 KiB/85.81 KiB                   \u001b[1A\n",
+              "\u001b[2K\u001b[2mPrepared \u001b[1m6 packages\u001b[0m \u001b[2min 365ms\u001b[0m\u001b[0m                                                 \u001b[1A\n",
+              "\u001b[2K\u001b[2mInstalled \u001b[1m6 packages\u001b[0m \u001b[2min 50ms\u001b[0m\u001b[0m                                \u001b[0m\n",
+              " \u001b[32m+\u001b[39m \u001b[1meval-type-backport\u001b[0m\u001b[2m==0.2.2\u001b[0m\n",
+              " \u001b[32m+\u001b[39m \u001b[1mfaiss-cpu\u001b[0m\u001b[2m==1.10.0\u001b[0m\n",
+              " \u001b[32m+\u001b[39m \u001b[1mshellingham\u001b[0m\u001b[2m==1.5.4\u001b[0m\n",
+              " \u001b[32m+\u001b[39m \u001b[1mtabulate\u001b[0m\u001b[2m==0.9.0\u001b[0m\n",
+              " \u001b[32m+\u001b[39m \u001b[1mtogether\u001b[0m\u001b[2m==1.5.5\u001b[0m\n",
+              " \u001b[32m+\u001b[39m \u001b[1mtyper\u001b[0m\u001b[2m==0.15.2\u001b[0m\n",
+              "torch torchvision --index-url https://download.pytorch.org/whl/cpu\n",
+              "\u001b[2mUsing Python 3.11.11 environment at: /Users/erichuang/projects/internal-llama-stack/.venv\u001b[0m\n",
+              "\u001b[2mAudited \u001b[1m2 packages\u001b[0m \u001b[2min 32ms\u001b[0m\u001b[0m\n",
+              "sentence-transformers --no-deps\n",
+              "\u001b[2mUsing Python 3.11.11 environment at: /Users/erichuang/projects/internal-llama-stack/.venv\u001b[0m\n",
+              "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 63ms\u001b[0m\u001b[0m\n",
+              "\u001b[32mBuild Successful!\u001b[0m\n"
+            ]
+          }
+        ],
+        "source": [
+          "import os \n",
+          "import subprocess\n",
+          "import time\n",
+          "\n",
+          "!pip install uv \n",
+          "!uv pip install requests\n",
+          "\n",
+          "if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
+          "  del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
+          "\n",
+          "# this command installs all the dependencies needed for the llama stack server \n",
+          "!uv run --with llama-stack llama stack build --template llama_api --image-type venv \n",
+          "\n",
+          "def run_llama_stack_server_background():\n",
+          "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
+          "    process = subprocess.Popen(\n",
+          "        \"uv run --with llama-stack llama stack run llama_api --image-type venv\",\n",
+          "        shell=True,\n",
+          "        stdout=log_file,\n",
+          "        stderr=log_file,\n",
+          "        text=True\n",
+          "    )\n",
+          "    \n",
+          "    print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
+          "    return process\n",
+          "\n",
+          "def wait_for_server_to_start():\n",
+          "    import requests\n",
+          "    from requests.exceptions import ConnectionError\n",
+          "    import time\n",
+          "    \n",
+          "    url = \"http://0.0.0.0:8321/v1/health\"\n",
+          "    max_retries = 30\n",
+          "    retry_interval = 1\n",
+          "    \n",
+          "    print(\"Waiting for server to start\", end=\"\")\n",
+          "    for _ in range(max_retries):\n",
+          "        try:\n",
+          "            response = requests.get(url)\n",
+          "            if response.status_code == 200:\n",
+          "                print(\"\\nServer is ready!\")\n",
+          "                return True\n",
+          "        except ConnectionError:\n",
+          "            print(\".\", end=\"\", flush=True)\n",
+          "            time.sleep(retry_interval)\n",
+          "            \n",
+          "    print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
+          "    return False\n",
+          "\n",
+          "\n",
+          "# use this helper if needed to kill the server \n",
+          "def kill_llama_stack_server():\n",
+          "    # Kill any existing llama stack server processes\n",
+          "    os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "c40e9efd",
+        "metadata": {},
+        "source": [
+          "### 1.3 Starting the Llama Stack Server"
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": null,
+        "id": "f779283d",
+        "metadata": {},
+        "outputs": [],
+        "source": [
+          "server_process = run_llama_stack_server_background()\n",
+          "assert wait_for_server_to_start()"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "90eb721b",
+        "metadata": {},
+        "source": [
+          "### 1.4 Install and Configure the Client\n",
+          "\n",
+          "Now that we have our Llama Stack server running locally, we need to install the client package to interact with it. The `llama-stack-client` provides a simple Python interface to access all the functionality of Llama Stack, including:\n",
+          "\n",
+          "- Chat Completions (text and multimodal)\n",
+          "- Safety Shields \n",
+          "- Agent capabilities with tools like web search, RAG with Telemetry\n",
+          "- Evaluation and scoring frameworks\n",
+          "\n",
+          "The client handles all the API communication with our local server, making it easy to integrate Llama Stack's capabilities into your applications.\n",
+          "\n",
+          "In the next cells, we'll:\n",
+          "\n",
+          "1. Install the client package\n",
+          "2. Set up the API key for the inference provider (Llama API)\n",
+          "3. Initialize the client to connect to our local server\n"
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 3,
+        "id": "2e68e32a",
+        "metadata": {},
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "\u001b[2mUsing Python 3.10.16 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/stack\u001b[0m\n",
+              "\u001b[2K\u001b[2mResolved \u001b[1m31 packages\u001b[0m \u001b[2min 284ms\u001b[0m\u001b[0m                                        \u001b[0m\n",
+              "\u001b[2mAudited \u001b[1m31 packages\u001b[0m \u001b[2min 0.04ms\u001b[0m\u001b[0m\n"
+            ]
+          }
+        ],
+        "source": [
+          "!pip install -U llama-stack-client"
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 3,
+        "id": "E1UFuJC570Tk",
+        "metadata": {
+          "colab": {
+            "base_uri": "https://localhost:8080/",
+            "height": 1000,
+            "referenced_widgets": [
+              "75307e3dee604d30aa44713e6e293e64",
+              "5ce87402a79342af995df41ac3940d55",
+              "fbbcc19886cc43b38424fbb184162c61",
+              "29212208db6b432eb4f708cd64258954",
+              "50dd8994a4cf486ebbec5ffd4322992a",
+              "f9b768c703494dd198f2978aff4892e8",
+              "1231b9e4cab34c33a38bee63543f1e75",
+              "754deb3970604d48a522bc9f021ad945",
+              "f6ecca7a1a8340fbbe056235a2714fc3",
+              "ef4f63fe9d8f4683a9d20becb6e4e2cb",
+              "7508f10c13634e7aa682cfb29c48d9e7",
+              "26f1430ca7cb4ad5b1b8df1ffdbd32a9",
+              "7cd2d9c9ea7b4d70902ffaff33033078",
+              "101288236cff40b8bb9dbad80dbbc7ee",
+              "d5c9977838a249eeab6ef628279b8155",
+              "d032d1e7b4b54ba28ac83c1a12b23876",
+              "321fce57c158432abeae496ae8a947aa",
+              "3ebe00201bdb4e119e3b74f684a58345",
+              "0f8bab6b8ed04774b386fe952aae66f1",
+              "cfcb6e456c354d99be91f161552f3376",
+              "61bd0d490c0e4c04a331cf9ce6b7d38f",
+              "7d8653fca29f4df3a7487733ff9db60b",
+              "943f8fcb66614353a51f32f8344b6122",
+              "0e695245b97c4bbc85e349fda3dc07b9",
+              "bb0d168c41f540b8ae42239d3938483a",
+              "87700a80125348f28c4f249bdf8b0a8d",
+              "8902c3622da540e496ed5b1524bd01ca",
+              "90432ec1c24b4607a935c94e130cd68d",
+              "464147b149824f20afc727751a702fc7",
+              "67e37a088be64a2ba786ca923b1017dd",
+              "98786f52ef5345b0b9164b9c1f2b8e18",
+              "0e1b9910a77d4b7fa69cb8926e6547d7",
+              "0b276315be4345be83da1e03905c8495",
+              "e11f8c3891284e07bd2572257afd5e1b",
+              "ee18d96394994d01b49d5b03b3d9a019",
+              "844b06df5749441fab6f61656ce581a9",
+              "e1c6b9a20e074f17aeba976b24e80c65",
+              "c690da8daa1e4f9ea73bcacdd92e8a6d",
+              "d0b161ae25c441e8b3caf7a3d88c1b05",
+              "47cf4b6b835d43388576a2abf4cc54f8",
+              "03bbebd659e64b5d9c29a73570c34854",
+              "b68e5097d2504d2cbd7e19aa1aac3a04",
+              "22a665deff88477b9372c0350c4c572b",
+              "5e535ed2b83e496ab57b1c80b615ab0c",
+              "d9de065c7f81443e98ddf066c7b5bd54",
+              "1e836106837c4ac7a11b36e700c46b64",
+              "55591e8179084fcfa3a61c8bd8d09dcb",
+              "de1ef93c41364eda9b4b111231057348",
+              "23b0b2f4f82c4a21846e91d7cea91da5",
+              "9e4d0fbb51284a7487c495c7b95a293d",
+              "b0f8cf1f79e04b5fb47a810f2c81bd7e",
+              "0c359bc4c94c46acbc9094354a15c33d",
+              "59d0b59b6c2248508d0601ff13878d33",
+              "891cb726d45c4fef8f2c74a56df5532b",
+              "fa39189070334939aea5fa4a7de5ec8b",
+              "f0e107dd6d54483aa367da0e337a97cd",
+              "861a00796f55470e85d94733eeee9a5f",
+              "5459633eb6e94ec391d13fcf67425726",
+              "b7b7467ece304ffbbd352b9b96a03aad",
+              "9dece059f1204e29b106fca9e191ddb3",
+              "e2e49c25d6fc4592b317e94cfabc2e5e",
+              "76d37a48a73946bab2821f097cf2605f",
+              "8e81ae00681347cb906b392c3656a64a",
+              "74bedc38b7da4e8a83b0c892d7aa59b5",
+              "d1e67c28b4664e8098dce8f5e80b8779",
+              "abe6cf39b784436993fcbe92221c31a3",
+              "d021a18ab70b4c7e8aec43932a124c36",
+              "72e7c092fb054b7ea0dcd2782b5d8a7d",
+              "8b1ea80221174fae943d5c9f997dfb57",
+              "f8073d625f80415dbf712cee434f6e3a",
+              "5f6014ba13fa4a659b9eb1b5f83599a7",
+              "327ff8f5292d47afbfebd3beea187739",
+              "988cac4341b646079fc73719f3f88ad7",
+              "900a4dac08f540dfb35c29f63236a12c",
+              "1e6009b9b0684b8fbaa379ea96f111ee",
+              "541b9b4e74614e2cb855bb90f03df538",
+              "ff256b2275f740ed82bca4f43b4d6fd2",
+              "3703041a499c426bb427ee008c81cde5",
+              "4b22bbacb995425fb32a2368f3685a92",
+              "49a66eeb9ef74de5ab8904fd90eb7558",
+              "08f9d125018b41c582a0fa1e234315f9",
+              "736c770230644894b85dbc34bd8f1d52",
+              "b67cbbf32f844a19b219be612d5038c9",
+              "774b513d64524ac7823a2cf13efa8d41",
+              "1e56da93bcf64ff490416d2b66cd3dc0",
+              "b7e35038ce344110b785753b655130f5",
+              "5472af91737446f4a4a2d92a3f684a45",
+              "9fb4368802da4a5a8101ba200d98403a",
+              "2e713bcc372e48b2a006558db4d1df68",
+              "1a277abd5ea44253bc6894bef258b52b",
+              "b3eedd82e7da4ce8b3ded70e49a2afd0",
+              "6f5c18cb8002471f8b3764effee37324",
+              "3bebac362b344e8d9103c5011613f1ea",
+              "670905a55b19458da69f83c8bcd511d1",
+              "ff54451a48394faaaa9d8cdb690d0718",
+              "36b5bc19b2d0407f8ab28ff0da2ce12d",
+              "879e48d9a9e04183903d94ffe98313d2",
+              "abce503d70594c2ca9afdc47847c125b",
+              "028e291ee53947bbbbc4bfb68c695f5f",
+              "a530662719374c95a9bef12e59e28c85",
+              "bffc0f4b12f141398535990709fd4f2c",
+              "04804c74e1dd43449d5f758cf5d0ba5e",
+              "95a506c3007c4525b01ee4e1600d671b",
+              "a0d6b0caeb2340fe96c8f5569e3d3ae4",
+              "30798f87a8b848d783fdacd71af5dc04",
+              "07ce54c75e76488ba4019a20b3707061",
+              "f023175de68445f98a6b01bb40ccdc6d",
+              "7389b79a0ff44cd68c7866995d728023",
+              "8e2b70ffe4eb4974bd6393fcc1292267",
+              "13eee164dc534424acb9dc9ee37a9465",
+              "722a7fe16af3422585a20c651345cfa4",
+              "f5596c1c9c4d42f3bc171961f9582eff",
+              "85d66e615b5742e78657b1e60c75fc72",
+              "731c02dc5dd446c3b22765575148e256",
+              "254ce460ce244c99a5afe39d5d51f6b7",
+              "4cf1dc345ace4da59f978f661487f975",
+              "8f30fca71bf24e5ca26e17c2321f893c",
+              "dd85d37dd1d14c7ea4592f8e11b2d2c8",
+              "3cb06377e4454f009d6b2aa7aa6ff0a9",
+              "4502477db4d948e693012364c2dcb370",
+              "52fe404ec9c14db2a7279b4c154eef3d"
+            ]
+          },
+          "collapsed": true,
+          "id": "E1UFuJC570Tk",
+          "outputId": "aebb69d4-c167-4de5-eb8a-dd19dd538f63"
+        },
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "Not in Google Colab environment\n"
+            ]
+          }
+        ],
+        "source": [
+          "import os\n",
+          "\n",
+          "try:\n",
+          "    from google.colab import userdata\n",
+          "    os.environ['LLAMA_API_KEY'] = userdata.get('LLAMA_API_KEY')\n",
+          "except ImportError:\n",
+          "    print(\"Not in Google Colab environment\")\n",
+          "\n",
+          "for key in ['LLAMA_API_KEY']:\n",
+          "    try:\n",
+          "        api_key = os.environ[key]\n",
+          "        if not api_key:\n",
+          "            raise ValueError(f\"{key} environment variable is empty\")\n",
+          "    except KeyError:\n",
+          "        api_key = input(f\"{key} environment variable is not set. Please enter your API key: \")\n",
+          "        os.environ[key] = api_key\n",
+          "\n",
+          "from llama_stack_client import LlamaStackClient\n",
+          "\n",
+          "client = LlamaStackClient(\n",
+          "    base_url=\"http://0.0.0.0:8321\", \n",
+          "    provider_data = {\n",
+          "        \"llama_api_key\": os.environ['LLAMA_API_KEY']\n",
+          "    }\n",
+          ")"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "635a7a6f",
+        "metadata": {},
+        "source": [
+          "Now that we have completed the setup and configuration, let's start exploring the capabilities of Llama 4!\n",
+          "\n"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "0fc75d73",
+        "metadata": {},
+        "source": [
+          "## 2. Running Llama 4"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "7dacaa2d-94e9-42e9-82a0-73522dfc7010",
+        "metadata": {
+          "id": "7dacaa2d-94e9-42e9-82a0-73522dfc7010"
+        },
+        "source": [
+          "### 2.1 Check available models\n",
+          "\n",
+          "All available models can be listed programmatically via the client."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 13,
+        "id": "ruO9jQna_t_S",
+        "metadata": {
+          "colab": {
+            "base_uri": "https://localhost:8080/"
+          },
+          "collapsed": true,
+          "id": "ruO9jQna_t_S",
+          "outputId": "ab1722a7-62ab-43bb-9cab-4e45bf62068a"
+        },
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "Available models:\n",
+              "- Llama-3.1-8B-Instruct\n",
+              "- meta-llama/Llama-3.1-8B-Instruct\n",
+              "- Llama-3.2-11B-Vision-Instruct\n",
+              "- meta-llama/Llama-3.2-11B-Vision-Instruct\n",
+              "- Llama-3.3-70B-Instruct\n",
+              "- meta-llama/Llama-3.3-70B-Instruct\n",
+              "- Llama-4-Maverick-17B-128E-Instruct-FP8\n",
+              "- meta-llama/Llama-4-Maverick-17B-128E-Instruct\n",
+              "- all-MiniLM-L6-v2\n"
+            ]
+          }
+        ],
+        "source": [
+          "from rich.pretty import pprint\n",
+          "\n",
+          "print(\"Available models:\")\n",
+          "for m in client.models.list():\n",
+          "    print(f\"- {m.identifier}\")\n"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "86366383",
+        "metadata": {
+          "id": "86366383"
+        },
+        "source": [
+          "### 2.2 Run a simple chat completion with one of the models\n",
+          "\n",
+          "We will test the client by doing a simple chat completion."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 14,
+        "id": "77c29dba",
+        "metadata": {
+          "colab": {
+            "base_uri": "https://localhost:8080/"
+          },
+          "id": "77c29dba",
+          "outputId": "4857974f-4c70-4bc4-f90a-6ae49dc9c41e"
+        },
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "Here is a two-sentence poem about a llama:\n",
+              "\n",
+              "With soft fur and gentle eyes, the llama roams with gentle surprise, a peaceful presence in the Andean skies. Its calm demeanor and soft humming song bring serenity to all who belong.\n"
+            ]
+          }
+        ],
+        "source": [
+          "# TODO: update this with a vision model\n",
+          "model_id = \"meta-llama/Llama-4-Maverick-17B-128E-Instruct\"\n",
+          "\n",
+          "response = client.inference.chat_completion(\n",
+          "    model_id=model_id,\n",
+          "    messages=[\n",
+          "        {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
+          "        {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
+          "    ],\n",
+          ")\n",
+          "\n",
+          "print(response.completion_message.content)\n"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "7737cd41",
+        "metadata": {},
+        "source": [
+          "### 2.3 Running multimodal inference"
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 15,
+        "id": "e7b1baa7",
+        "metadata": {},
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
+              "                                 Dload  Upload   Total   Spent    Left  Speed\n",
+              "100  275k  100  275k    0     0   847k      0 --:--:-- --:--:-- --:--:--  845k--:--:-- --:--:--     0\n"
+            ]
+          },
+          {
+            "data": {
+              "image/jpeg": "",
+              "text/plain": [
+                ""
+              ]
+            },
+            "execution_count": 15,
+            "metadata": {
+              "image/jpeg": {
+                "height": 256,
+                "width": 256
+              }
+            },
+            "output_type": "execute_result"
+          }
+        ],
+        "source": [
+          "!curl -O https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg\n",
+          "\n",
+          "from IPython.display import Image\n",
+          "Image(\"Llama_Repo.jpeg\", width=256, height=256)"
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 16,
+        "id": "e1450ecc",
+        "metadata": {},
+        "outputs": [],
+        "source": [
+          "import base64\n",
+          "def encode_image(image_path):\n",
+          "    with open(image_path, \"rb\") as image_file:\n",
+          "        base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
+          "        base64_url = f\"data:image/png;base64,{base64_string}\"\n",
+          "        return base64_url"
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 18,
+        "id": "d7914894",
+        "metadata": {},
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "The image features three llamas, each with a distinct color. The llama on the left is white, the middle one is purple, and the one on the right is also white but wears a blue party hat.\n",
+              "\n",
+              "To determine the number of different colors present, we can count the unique hues:\n",
+              "\n",
+              "1. White (two llamas)\n",
+              "2. Purple (one llama)\n",
+              "3. Blue (party hat)\n",
+              "\n",
+              "Therefore, there are 3 different colors visible in the image: white, purple, and blue.\n"
+            ]
+          }
+        ],
+        "source": [
+          "response = client.inference.chat_completion(\n",
+          "    messages=[\n",
+          "        {\n",
+          "            \"role\": \"user\",\n",
+          "            \"content\": [\n",
+          "                {\n",
+          "                    \"type\": \"image\",\n",
+          "                    \"image\": {\n",
+          "                        \"url\": {\n",
+          "                            \"uri\": encode_image(\"Llama_Repo.jpeg\")\n",
+          "                        }\n",
+          "                    }\n",
+          "                },\n",
+          "                {\n",
+          "                    \"type\": \"text\",\n",
+          "                    \"text\": \"How many different colors are those llamas? What are those colors?\",\n",
+          "                }\n",
+          "            ]\n",
+          "        }\n",
+          "    ],\n",
+          "    model_id=model_id,\n",
+          "    stream=False,\n",
+          ")\n",
+          "\n",
+          "print(response.completion_message.content)"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "8cf0d555",
+        "metadata": {
+          "id": "8cf0d555"
+        },
+        "source": [
+          "### 2.4 Have a conversation\n",
+          "\n",
+          "Maintaining a conversation history allows the model to retain context from previous interactions. Use a list to accumulate messages, enabling continuity throughout the chat session."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 19,
+        "id": "3fdf9df6",
+        "metadata": {
+          "id": "3fdf9df6"
+        },
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "\u001b[36m> Response: The most famous Prime Minister of England during World War 2 was Winston Churchill. He served as the Prime Minister of the United Kingdom from 1940 to 1945, and again from 1951 to 1955. Churchill is widely regarded as one of the greatest wartime leaders in history, known for his leadership, oratory skills, and unwavering resolve during the war.\n",
+              "\n",
+              "Churchill played a crucial role in rallying the British people during the war, and his speeches, such as the \"We shall fight on the beaches\" and \"Their finest hour\" speeches, are still remembered and celebrated today. He worked closely with other Allied leaders, including US President Franklin D. Roosevelt and Soviet leader Joseph Stalin, to coordinate the war effort and ultimately secure the defeat of Nazi Germany.\n",
+              "\n",
+              "Churchill's leadership and legacy have endured long after the war, and he remains one of the most iconic and influential figures in British history.\u001b[0m\n",
+              "\u001b[36m> Response: Winston Churchill was known for his many memorable quotes, but one of his most famous is:\n",
+              "\n",
+              "**\"We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the streets, we shall fight in the hills; we shall never surrender.\"**\n",
+              "\n",
+              "This quote is from his speech to the House of Commons on June 4, 1940, during the early stages of World War II, when Nazi Germany was threatening to invade Britain. The speech is known as the \"We Shall Fight on the Beaches\" speech, and it's considered one of the greatest speeches of the 20th century.\n",
+              "\n",
+              "However, if I had to pick a single, even more concise quote, it would be:\n",
+              "\n",
+              "**\"Blood, toil, tears, and sweat.\"**\n",
+              "\n",
+              "This was the opening phrase of his first speech as Prime Minister to the House of Commons on May 13, 1940, in which he said:\n",
+              "\n",
+              "\"I say to the House as I said to those who have joined this Government, I have nothing to offer but blood, toil, tears, and sweat. We have before us an ordeal of the most grievous kind.\"\n",
+              "\n",
+              "This quote has become synonymous with Churchill's leadership and resolve during the war.\u001b[0m\n"
+            ]
+          }
+        ],
+        "source": [
+          "from termcolor import cprint\n",
+          "\n",
+          "questions = [\n",
+          "    \"Who was the most famous PM of England during world war 2 ?\",\n",
+          "    \"What was his most famous quote ?\"\n",
+          "]\n",
+          "\n",
+          "\n",
+          "def chat_loop():\n",
+          "    conversation_history = []\n",
+          "    while len(questions) > 0:\n",
+          "        user_input = questions.pop(0)\n",
+          "        if user_input.lower() in [\"exit\", \"quit\", \"bye\"]:\n",
+          "            cprint(\"Ending conversation. Goodbye!\", \"yellow\")\n",
+          "            break\n",
+          "\n",
+          "        user_message = {\"role\": \"user\", \"content\": user_input}\n",
+          "        conversation_history.append(user_message)\n",
+          "\n",
+          "        response = client.inference.chat_completion(\n",
+          "            messages=conversation_history,\n",
+          "            model_id=model_id,\n",
+          "        )\n",
+          "        cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+          "\n",
+          "        assistant_message = {\n",
+          "            \"role\": \"assistant\",  # was user\n",
+          "            \"content\": response.completion_message.content,\n",
+          "            \"stop_reason\": response.completion_message.stop_reason,\n",
+          "        }\n",
+          "        conversation_history.append(assistant_message)\n",
+          "\n",
+          "\n",
+          "chat_loop()\n"
+        ]
+      },
+      {
+        "cell_type": "markdown",
+        "id": "72e5111e",
+        "metadata": {
+          "id": "72e5111e"
+        },
+        "source": [
+          "Here is an interactive example so you can try a conversation yourself.\n",
+          "Remember to type `quit`, `exit`, or `bye` when you are done chatting."
+        ]
+      },
+      {
+        "cell_type": "code",
+        "execution_count": 35,
+        "id": "9496f75c",
+        "metadata": {
+          "colab": {
+            "base_uri": "https://localhost:8080/"
+          },
+          "id": "9496f75c",
+          "outputId": "7d93a4cf-a5d4-4741-b6eb-6bce3a27ff66"
+        },
+        "outputs": [
+          {
+            "name": "stdout",
+            "output_type": "stream",
+            "text": [
+              "\u001b[36m> Response: Hello! How are you today? Is there something I can help you with or would you like to chat?\u001b[0m\n",
+              "\u001b[33mEnding conversation. Goodbye!\u001b[0m\n"
+            ]
+          }
+        ],
+        "source": [
+          "# NBVAL_SKIP\n",
+          "from termcolor import cprint\n",
+          "\n",
+          "def chat_loop():\n",
+          "    conversation_history = []\n",
+          "    while True:\n",
+          "        user_input = input(\"User> \")\n",
+          "        if user_input.lower() in [\"exit\", \"quit\", \"bye\"]:\n",
+          "            cprint(\"Ending conversation. Goodbye!\", \"yellow\")\n",
+          "            break\n",
+          "\n",
+          "        user_message = {\"role\": \"user\", \"content\": user_input}\n",
+          "        conversation_history.append(user_message)\n",
+          "\n",
+          "        response = client.inference.chat_completion(\n",
+          "            messages=conversation_history,\n",
+          "            model_id=model_id,\n",
+          "        )\n",
+          "        cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+          "\n",
+          "        assistant_message = {\n",
+          "            \"role\": \"assistant\",  # was user\n",
+          "            \"content\": response.completion_message.content,\n",
+          "            \"stop_reason\": response.completion_message.stop_reason,\n",
+          "        }\n",
+          "        conversation_history.append(assistant_message)\n",
+          "\n",
+          "\n",
+          "chat_loop()\n"
+        ]
+      }
+    ],
+    "metadata": {
+      "accelerator": "GPU",
+      "colab": {
+        "gpuType": "T4",
+        "provenance": []
+      },
+      "kernelspec": {
+        "display_name": "l4",
+        "language": "python",
+        "name": "python3"
+      },
+      "language_info": {
+        "codemirror_mode": {
+          "name": "ipython",
+          "version": 3
+        },
+        "file_extension": ".py",
+        "mimetype": "text/x-python",
+        "name": "python",
+        "nbconvert_exporter": "python",
+        "pygments_lexer": "ipython3",
+        "version": "3.10.16"
+      }
+    },
+    "nbformat": 4,
+    "nbformat_minor": 5
+  }
diff --git a/docs/make.bat b/docs/make.bat
index 32bb24529..954237b9b 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -1,35 +1,35 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 5de7f715e..93f78d268 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -38,12 +38,8 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
         "collapsed": true,
-        "id": "O9pGVlPIjpix",
-        "outputId": "e1fbe723-ae31-4630-eb80-4c4f6476d56f"
+        "id": "O9pGVlPIjpix"
       },
       "outputs": [],
       "source": [
@@ -55,12 +51,8 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
         "collapsed": true,
-        "id": "JQpLUSNjlGAM",
-        "outputId": "2f7fec97-5511-4cae-d51e-6d262fbca19c"
+        "id": "JQpLUSNjlGAM"
       },
       "outputs": [],
       "source": [
@@ -70,7 +62,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -337,9 +329,6 @@
               "    provider_id: tavily-search\n",
               "    provider_type: remote::tavily-search\n",
               "  - config: {}\n",
-              "    provider_id: code-interpreter\n",
-              "    provider_type: inline::code-interpreter\n",
-              "  - config: {}\n",
               "    provider_id: rag-runtime\n",
               "    provider_type: inline::rag-runtime\n",
               "  - config: {}\n",
@@ -378,10 +367,6 @@
               "  toolgroup_id: builtin::rag\n",
               "- args: null\n",
               "  mcp_endpoint: null\n",
-              "  provider_id: code-interpreter\n",
-              "  toolgroup_id: builtin::code_interpreter\n",
-              "- args: null\n",
-              "  mcp_endpoint: null\n",
               "  provider_id: wolfram-alpha\n",
               "  toolgroup_id: builtin::wolfram_alpha\n",
               "vector_dbs: []\n",
@@ -617,9 +602,6 @@
               "    provider_id: tavily-search\n",
               "    provider_type: remote::tavily-search\n",
               "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
-              "    provider_id: code-interpreter\n",
-              "    provider_type: inlin\u001b[1;92me::c\u001b[0mode-interpreter\n",
-              "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
               "    provider_id: rag-runtime\n",
               "    provider_type: inline::rag-runtime\n",
               "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
@@ -658,10 +640,6 @@
               "  toolgroup_id: builtin::rag\n",
               "- args: null\n",
               "  mcp_endpoint: null\n",
-              "  provider_id: code-interpreter\n",
-              "  toolgroup_id: builtin::code_interpreter\n",
-              "- args: null\n",
-              "  mcp_endpoint: null\n",
               "  provider_id: wolfram-alpha\n",
               "  toolgroup_id: builtin::wolfram_alpha\n",
               "vector_dbs: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
@@ -715,7 +693,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 2,
+      "execution_count": null,
       "metadata": {
         "id": "TC_IwIAQo4q-"
       },
@@ -728,116 +706,10 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 3,
+      "execution_count": null,
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 305,
-          "referenced_widgets": [
-            "feb82e061ee44283b4a46be858ef4cd7",
-            "78a2d2d4ee3f42f3be42ef4baa298561",
-            "ba5e6ca09f174ef3a348453cf5cfc24a",
-            "74b58e4647644c9daf9af488942fdaf4",
-            "d56e218958a041e286e80f24e400ab0b",
-            "cab80632b7564a9eb59583e09573c1ee",
-            "10c0d50d7c204de0b4c8e8f4d3ec0af5",
-            "626ef2f811ae4e119a0e85cebe92b91d",
-            "aef4172d916f40b0ab4ed09104e10f24",
-            "25529e7fd57049d2816d31f696eab1fd",
-            "093bdcb608cf4b4fa37b0032a3915187",
-            "c788d4e9e1e24dca9b6503689df9b631",
-            "d1587e2144bf46299c1bdec3ea96e4e7",
-            "500a072c09da41759cb2c942a16d8429",
-            "9785009392934e3bbb229e8781667cbc",
-            "84570fe2c2a54a068fb9b8cbc8b041a1",
-            "f9e579c58e3f4ae0bbb721dffa33bf0a",
-            "737116977f474ec0b68d88a40fd1086c",
-            "e6d6e516cd03452297d80c36376855dd",
-            "6ae0fadb3aeb4be18a9ab3279fb23145",
-            "fa4800a506ac480984d58933580df086",
-            "117468099dbc42fdaafc08207eaac7ab",
-            "44f585990aa244d8ba61f892dc1ccc1c",
-            "4fc59928a0544f95a4438b37d19ca437",
-            "fb644d47049f495397d0e60597c86ea3",
-            "78632694ff694442bc3fefc2cac2cbf5",
-            "083fd2549abd4b03bd41d8b92ec28f42",
-            "611d6472a58d419583acc416767a4c90",
-            "98c5ce434cff454eaaa3f0fd3498183a",
-            "3d0344a9cc744e369da1b6b7ea1b3be8",
-            "c452ccbf47a44073aee710175f707a7d",
-            "0218397c573e4b28bfb4ffa66464d50f",
-            "9b01bcd6e5174be2af19f457047017c8",
-            "4fed5720f30b4b3cbbc606a4f25e223b",
-            "6fa866b9971542739b0ed26d90ceac80",
-            "fe7553b513954cc68c427b5d9d260b33",
-            "4bc266d49a6741a88350e029d101425b",
-            "da57445f98e7427589962836c2b4287e",
-            "ad1fb86cc1f94fd9911eda03cf4a3783",
-            "fdefb51ad4c4418b98c5826126558011",
-            "179d41b80dc841e8a440482516b8bca5",
-            "22b1ecd2eff14770bcfb0c62d3d4213f",
-            "47f876cf41484d55b645e1e99337423a",
-            "340fbbb4982c460992c88885e79b47db",
-            "9659140487ca4d3ea799196d2c1ecf61",
-            "52150fd494d24eea89b5232077509355",
-            "04acde771d0a46699e1de07d9733d1a3",
-            "7b98103300814f3caea84266263b95a2",
-            "75f06408071c494f934bb909b84110d1",
-            "b09b2690894749339a9172e5ad0a9b75",
-            "cbed38801163438d891879b756f5baab",
-            "399a6417b23e4593bb244ec3abb6b46d",
-            "53a321f36b0d4e08a74a5bcfbd04434b",
-            "b8c0c8aaac0d4032bf5c673a43d084ab",
-            "d1f32499fa3f4795b92361637e23a9bb",
-            "c06f9a090fb54c74b947634bf6d11fa8",
-            "82991dcc80f14af9bd2e95f705980676",
-            "cd832e3842b945aabbb327856053f261",
-            "93ee645d54f34acdb0d15092d4a6f0d1",
-            "b77fe05bbcf84cdc8ef85b264ccd35f6",
-            "e17d286a965a49cfb8d5bf885865cb1e",
-            "ca015c1a0c1449e68edb282462435a3f",
-            "2932b06afde9468a976eb6bfb072b80e",
-            "d027c807ddc04f89bec41dc05fde7718",
-            "4ff3a6aaf706460bbba01b248b93000e",
-            "bfd75a39f0154c30adbaad1e2ca0f1e2",
-            "4f788a7920c346f3b42900825bd6711a",
-            "8e9358ec7d474808bb96c13e13489c67",
-            "f0dfeee2a8d64dedbc8ef55ad4e69932",
-            "9437b707bf1a4847a50aafeb4252dab5",
-            "f255707788704a76bd1651f26a22402d",
-            "3b70fa4e43ef4951862e119378c3c501",
-            "6c0a6a7fa8ca4e1c961a36305f0e7638",
-            "201bd914f9884e46b8e6df9d9900a6e8",
-            "f53b7ada01084e73bba6e14a95e2a534",
-            "d2029292327b488db02fd123ee2b75af",
-            "3e26bc24a3e44b4582f57913bdf98de4",
-            "9d2b6eabf7e14436b72bbf374b4a2a0a",
-            "b5d7cb5a6157449a850ef0e12e3d3eb7",
-            "c245d316bf9e44dabe5bfd1e47fc8d2e",
-            "963cf422ca894d82b0dd94c6165d41bf",
-            "78d0e2aa93674bbeb42bff87a23cce9b",
-            "12c6f1180eeb4e9eb9037ea5dd24ec8e",
-            "017a81d7160240a398947545963856f5",
-            "1cf8eeb8d81c4e8a8e95dd43296a78b9",
-            "5b0b5a3f79e94c51aae48fe0dd34ba0e",
-            "f5b34a743ce54fb591f25b04a2651d65",
-            "dec6399e2c5341aead66e1674d3e6c72",
-            "24e48376a72940679989a39a40bbe7f6",
-            "484df732051540859bc7ac9cecadc83c",
-            "4b33b1db50c34a2fa957d81a71a2a47f",
-            "e51d501e2f994baba40345ad632eabee",
-            "631a85e420b64e8cb6915af59c5ce08a",
-            "70af9cb2838c4a92bd67f8cb5c98d97f",
-            "158115266c284c4f8dbce3586151cbf1",
-            "ce5019b36cde44c58c5f596dbb59a2f8",
-            "b90d660ca8584ba1815a3c66b420c079",
-            "7c4d1de626784a59a7e0a33c24086186",
-            "21cf0e35ecd845a8b5e7c5ce241cf177"
-          ]
-        },
         "collapsed": true,
-        "id": "DJkmoG2kq1_P",
-        "outputId": "8493ee59-c6ff-4bb6-d787-f295944db1cf"
+        "id": "DJkmoG2kq1_P"
       },
       "outputs": [],
       "source": [
@@ -862,7 +734,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 4,
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -963,7 +835,7 @@
         "\n",
         "client.benchmarks.register(\n",
         "    benchmark_id=\"meta-reference::mmmu\",\n",
-        "    # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
+        "    # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the\n",
         "    # `input_rows` argument and does not fetch data from the dataset.\n",
         "    dataset_id=f\"mmmu-{subset}-{split}\",\n",
         "    # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
@@ -1008,7 +880,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 5,
+      "execution_count": null,
       "metadata": {
         "id": "HXmZf3Ymw-aX"
       },
@@ -1028,7 +900,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 6,
+      "execution_count": null,
       "metadata": {
         "id": "Gc8azb4Rxr5J"
       },
@@ -1042,7 +914,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 7,
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -1182,7 +1054,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 27,
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -1307,7 +1179,9 @@
     {
       "cell_type": "code",
       "execution_count": null,
-      "metadata": {},
+      "metadata": {
+        "id": "lxc9-eXYK5Av"
+      },
       "outputs": [],
       "source": []
     }
@@ -1336,3088 +1210,6 @@
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
       "version": "3.10.16"
-    },
-    "widgets": {
-      "application/vnd.jupyter.widget-state+json": {
-        "017a81d7160240a398947545963856f5": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "0218397c573e4b28bfb4ffa66464d50f": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "04acde771d0a46699e1de07d9733d1a3": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_399a6417b23e4593bb244ec3abb6b46d",
-            "max": 453677660,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_53a321f36b0d4e08a74a5bcfbd04434b",
-            "value": 453677660
-          }
-        },
-        "083fd2549abd4b03bd41d8b92ec28f42": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "093bdcb608cf4b4fa37b0032a3915187": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "10c0d50d7c204de0b4c8e8f4d3ec0af5": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "117468099dbc42fdaafc08207eaac7ab": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "12c6f1180eeb4e9eb9037ea5dd24ec8e": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "158115266c284c4f8dbce3586151cbf1": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "179d41b80dc841e8a440482516b8bca5": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "1cf8eeb8d81c4e8a8e95dd43296a78b9": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "201bd914f9884e46b8e6df9d9900a6e8": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "21cf0e35ecd845a8b5e7c5ce241cf177": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "22b1ecd2eff14770bcfb0c62d3d4213f": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "24e48376a72940679989a39a40bbe7f6": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_484df732051540859bc7ac9cecadc83c",
-              "IPY_MODEL_4b33b1db50c34a2fa957d81a71a2a47f",
-              "IPY_MODEL_e51d501e2f994baba40345ad632eabee"
-            ],
-            "layout": "IPY_MODEL_631a85e420b64e8cb6915af59c5ce08a"
-          }
-        },
-        "25529e7fd57049d2816d31f696eab1fd": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "2932b06afde9468a976eb6bfb072b80e": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "340fbbb4982c460992c88885e79b47db": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "399a6417b23e4593bb244ec3abb6b46d": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "3b70fa4e43ef4951862e119378c3c501": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "3d0344a9cc744e369da1b6b7ea1b3be8": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "3e26bc24a3e44b4582f57913bdf98de4": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "44f585990aa244d8ba61f892dc1ccc1c": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_4fc59928a0544f95a4438b37d19ca437",
-              "IPY_MODEL_fb644d47049f495397d0e60597c86ea3",
-              "IPY_MODEL_78632694ff694442bc3fefc2cac2cbf5"
-            ],
-            "layout": "IPY_MODEL_083fd2549abd4b03bd41d8b92ec28f42"
-          }
-        },
-        "47f876cf41484d55b645e1e99337423a": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "484df732051540859bc7ac9cecadc83c": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_70af9cb2838c4a92bd67f8cb5c98d97f",
-            "placeholder": "​",
-            "style": "IPY_MODEL_158115266c284c4f8dbce3586151cbf1",
-            "value": "Generating test split: 100%"
-          }
-        },
-        "4b33b1db50c34a2fa957d81a71a2a47f": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_ce5019b36cde44c58c5f596dbb59a2f8",
-            "max": 287,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_b90d660ca8584ba1815a3c66b420c079",
-            "value": 287
-          }
-        },
-        "4bc266d49a6741a88350e029d101425b": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_47f876cf41484d55b645e1e99337423a",
-            "placeholder": "​",
-            "style": "IPY_MODEL_340fbbb4982c460992c88885e79b47db",
-            "value": " 461M/461M [00:11<00:00, 31.2MB/s]"
-          }
-        },
-        "4f788a7920c346f3b42900825bd6711a": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_8e9358ec7d474808bb96c13e13489c67",
-              "IPY_MODEL_f0dfeee2a8d64dedbc8ef55ad4e69932",
-              "IPY_MODEL_9437b707bf1a4847a50aafeb4252dab5"
-            ],
-            "layout": "IPY_MODEL_f255707788704a76bd1651f26a22402d"
-          }
-        },
-        "4fc59928a0544f95a4438b37d19ca437": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_611d6472a58d419583acc416767a4c90",
-            "placeholder": "​",
-            "style": "IPY_MODEL_98c5ce434cff454eaaa3f0fd3498183a",
-            "value": "validation-00000-of-00001.parquet: 100%"
-          }
-        },
-        "4fed5720f30b4b3cbbc606a4f25e223b": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_6fa866b9971542739b0ed26d90ceac80",
-              "IPY_MODEL_fe7553b513954cc68c427b5d9d260b33",
-              "IPY_MODEL_4bc266d49a6741a88350e029d101425b"
-            ],
-            "layout": "IPY_MODEL_da57445f98e7427589962836c2b4287e"
-          }
-        },
-        "4ff3a6aaf706460bbba01b248b93000e": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "500a072c09da41759cb2c942a16d8429": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_e6d6e516cd03452297d80c36376855dd",
-            "max": 29453850,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_6ae0fadb3aeb4be18a9ab3279fb23145",
-            "value": 29453850
-          }
-        },
-        "52150fd494d24eea89b5232077509355": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_b09b2690894749339a9172e5ad0a9b75",
-            "placeholder": "​",
-            "style": "IPY_MODEL_cbed38801163438d891879b756f5baab",
-            "value": "test-00001-of-00003.parquet: 100%"
-          }
-        },
-        "53a321f36b0d4e08a74a5bcfbd04434b": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "5b0b5a3f79e94c51aae48fe0dd34ba0e": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "611d6472a58d419583acc416767a4c90": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "626ef2f811ae4e119a0e85cebe92b91d": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "631a85e420b64e8cb6915af59c5ce08a": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "6ae0fadb3aeb4be18a9ab3279fb23145": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "6c0a6a7fa8ca4e1c961a36305f0e7638": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "6fa866b9971542739b0ed26d90ceac80": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_ad1fb86cc1f94fd9911eda03cf4a3783",
-            "placeholder": "​",
-            "style": "IPY_MODEL_fdefb51ad4c4418b98c5826126558011",
-            "value": "test-00000-of-00003.parquet: 100%"
-          }
-        },
-        "70af9cb2838c4a92bd67f8cb5c98d97f": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "737116977f474ec0b68d88a40fd1086c": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "74b58e4647644c9daf9af488942fdaf4": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_25529e7fd57049d2816d31f696eab1fd",
-            "placeholder": "​",
-            "style": "IPY_MODEL_093bdcb608cf4b4fa37b0032a3915187",
-            "value": " 36.0k/36.0k [00:00<00:00, 1.29MB/s]"
-          }
-        },
-        "75f06408071c494f934bb909b84110d1": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "78632694ff694442bc3fefc2cac2cbf5": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_0218397c573e4b28bfb4ffa66464d50f",
-            "placeholder": "​",
-            "style": "IPY_MODEL_9b01bcd6e5174be2af19f457047017c8",
-            "value": " 165M/165M [00:03<00:00, 42.9MB/s]"
-          }
-        },
-        "78a2d2d4ee3f42f3be42ef4baa298561": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_cab80632b7564a9eb59583e09573c1ee",
-            "placeholder": "​",
-            "style": "IPY_MODEL_10c0d50d7c204de0b4c8e8f4d3ec0af5",
-            "value": "README.md: 100%"
-          }
-        },
-        "78d0e2aa93674bbeb42bff87a23cce9b": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "7b98103300814f3caea84266263b95a2": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_b8c0c8aaac0d4032bf5c673a43d084ab",
-            "placeholder": "​",
-            "style": "IPY_MODEL_d1f32499fa3f4795b92361637e23a9bb",
-            "value": " 454M/454M [00:11<00:00, 40.4MB/s]"
-          }
-        },
-        "7c4d1de626784a59a7e0a33c24086186": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "82991dcc80f14af9bd2e95f705980676": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_e17d286a965a49cfb8d5bf885865cb1e",
-            "placeholder": "​",
-            "style": "IPY_MODEL_ca015c1a0c1449e68edb282462435a3f",
-            "value": "test-00002-of-00003.parquet: 100%"
-          }
-        },
-        "84570fe2c2a54a068fb9b8cbc8b041a1": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "8e9358ec7d474808bb96c13e13489c67": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_3b70fa4e43ef4951862e119378c3c501",
-            "placeholder": "​",
-            "style": "IPY_MODEL_6c0a6a7fa8ca4e1c961a36305f0e7638",
-            "value": "Generating dev split: 100%"
-          }
-        },
-        "93ee645d54f34acdb0d15092d4a6f0d1": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_4ff3a6aaf706460bbba01b248b93000e",
-            "placeholder": "​",
-            "style": "IPY_MODEL_bfd75a39f0154c30adbaad1e2ca0f1e2",
-            "value": " 471M/471M [00:11<00:00, 41.5MB/s]"
-          }
-        },
-        "9437b707bf1a4847a50aafeb4252dab5": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_d2029292327b488db02fd123ee2b75af",
-            "placeholder": "​",
-            "style": "IPY_MODEL_3e26bc24a3e44b4582f57913bdf98de4",
-            "value": " 5/5 [00:00<00:00,  8.03 examples/s]"
-          }
-        },
-        "963cf422ca894d82b0dd94c6165d41bf": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_f5b34a743ce54fb591f25b04a2651d65",
-            "placeholder": "​",
-            "style": "IPY_MODEL_dec6399e2c5341aead66e1674d3e6c72",
-            "value": " 30/30 [00:03<00:00,  8.23 examples/s]"
-          }
-        },
-        "9659140487ca4d3ea799196d2c1ecf61": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_52150fd494d24eea89b5232077509355",
-              "IPY_MODEL_04acde771d0a46699e1de07d9733d1a3",
-              "IPY_MODEL_7b98103300814f3caea84266263b95a2"
-            ],
-            "layout": "IPY_MODEL_75f06408071c494f934bb909b84110d1"
-          }
-        },
-        "9785009392934e3bbb229e8781667cbc": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_fa4800a506ac480984d58933580df086",
-            "placeholder": "​",
-            "style": "IPY_MODEL_117468099dbc42fdaafc08207eaac7ab",
-            "value": " 29.5M/29.5M [00:00<00:00, 36.5MB/s]"
-          }
-        },
-        "98c5ce434cff454eaaa3f0fd3498183a": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "9b01bcd6e5174be2af19f457047017c8": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "9d2b6eabf7e14436b72bbf374b4a2a0a": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_b5d7cb5a6157449a850ef0e12e3d3eb7",
-              "IPY_MODEL_c245d316bf9e44dabe5bfd1e47fc8d2e",
-              "IPY_MODEL_963cf422ca894d82b0dd94c6165d41bf"
-            ],
-            "layout": "IPY_MODEL_78d0e2aa93674bbeb42bff87a23cce9b"
-          }
-        },
-        "ad1fb86cc1f94fd9911eda03cf4a3783": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "aef4172d916f40b0ab4ed09104e10f24": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "b09b2690894749339a9172e5ad0a9b75": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "b5d7cb5a6157449a850ef0e12e3d3eb7": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_12c6f1180eeb4e9eb9037ea5dd24ec8e",
-            "placeholder": "​",
-            "style": "IPY_MODEL_017a81d7160240a398947545963856f5",
-            "value": "Generating validation split: 100%"
-          }
-        },
-        "b77fe05bbcf84cdc8ef85b264ccd35f6": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "b8c0c8aaac0d4032bf5c673a43d084ab": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "b90d660ca8584ba1815a3c66b420c079": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "ba5e6ca09f174ef3a348453cf5cfc24a": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_626ef2f811ae4e119a0e85cebe92b91d",
-            "max": 36030,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_aef4172d916f40b0ab4ed09104e10f24",
-            "value": 36030
-          }
-        },
-        "bfd75a39f0154c30adbaad1e2ca0f1e2": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "c06f9a090fb54c74b947634bf6d11fa8": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_82991dcc80f14af9bd2e95f705980676",
-              "IPY_MODEL_cd832e3842b945aabbb327856053f261",
-              "IPY_MODEL_93ee645d54f34acdb0d15092d4a6f0d1"
-            ],
-            "layout": "IPY_MODEL_b77fe05bbcf84cdc8ef85b264ccd35f6"
-          }
-        },
-        "c245d316bf9e44dabe5bfd1e47fc8d2e": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_1cf8eeb8d81c4e8a8e95dd43296a78b9",
-            "max": 30,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_5b0b5a3f79e94c51aae48fe0dd34ba0e",
-            "value": 30
-          }
-        },
-        "c452ccbf47a44073aee710175f707a7d": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "c788d4e9e1e24dca9b6503689df9b631": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_d1587e2144bf46299c1bdec3ea96e4e7",
-              "IPY_MODEL_500a072c09da41759cb2c942a16d8429",
-              "IPY_MODEL_9785009392934e3bbb229e8781667cbc"
-            ],
-            "layout": "IPY_MODEL_84570fe2c2a54a068fb9b8cbc8b041a1"
-          }
-        },
-        "ca015c1a0c1449e68edb282462435a3f": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "cab80632b7564a9eb59583e09573c1ee": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "cbed38801163438d891879b756f5baab": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "cd832e3842b945aabbb327856053f261": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_2932b06afde9468a976eb6bfb072b80e",
-            "max": 470745176,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_d027c807ddc04f89bec41dc05fde7718",
-            "value": 470745176
-          }
-        },
-        "ce5019b36cde44c58c5f596dbb59a2f8": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "d027c807ddc04f89bec41dc05fde7718": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "d1587e2144bf46299c1bdec3ea96e4e7": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_f9e579c58e3f4ae0bbb721dffa33bf0a",
-            "placeholder": "​",
-            "style": "IPY_MODEL_737116977f474ec0b68d88a40fd1086c",
-            "value": "dev-00000-of-00001.parquet: 100%"
-          }
-        },
-        "d1f32499fa3f4795b92361637e23a9bb": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "d2029292327b488db02fd123ee2b75af": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "d56e218958a041e286e80f24e400ab0b": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "da57445f98e7427589962836c2b4287e": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "dec6399e2c5341aead66e1674d3e6c72": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "e17d286a965a49cfb8d5bf885865cb1e": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "e51d501e2f994baba40345ad632eabee": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HTMLModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HTMLModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HTMLView",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_7c4d1de626784a59a7e0a33c24086186",
-            "placeholder": "​",
-            "style": "IPY_MODEL_21cf0e35ecd845a8b5e7c5ce241cf177",
-            "value": " 287/287 [00:23<00:00, 12.48 examples/s]"
-          }
-        },
-        "e6d6e516cd03452297d80c36376855dd": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "f0dfeee2a8d64dedbc8ef55ad4e69932": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_201bd914f9884e46b8e6df9d9900a6e8",
-            "max": 5,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_f53b7ada01084e73bba6e14a95e2a534",
-            "value": 5
-          }
-        },
-        "f255707788704a76bd1651f26a22402d": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "f53b7ada01084e73bba6e14a95e2a534": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "ProgressStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "ProgressStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "bar_color": null,
-            "description_width": ""
-          }
-        },
-        "f5b34a743ce54fb591f25b04a2651d65": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "f9e579c58e3f4ae0bbb721dffa33bf0a": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "fa4800a506ac480984d58933580df086": {
-          "model_module": "@jupyter-widgets/base",
-          "model_module_version": "1.2.0",
-          "model_name": "LayoutModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/base",
-            "_model_module_version": "1.2.0",
-            "_model_name": "LayoutModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "LayoutView",
-            "align_content": null,
-            "align_items": null,
-            "align_self": null,
-            "border": null,
-            "bottom": null,
-            "display": null,
-            "flex": null,
-            "flex_flow": null,
-            "grid_area": null,
-            "grid_auto_columns": null,
-            "grid_auto_flow": null,
-            "grid_auto_rows": null,
-            "grid_column": null,
-            "grid_gap": null,
-            "grid_row": null,
-            "grid_template_areas": null,
-            "grid_template_columns": null,
-            "grid_template_rows": null,
-            "height": null,
-            "justify_content": null,
-            "justify_items": null,
-            "left": null,
-            "margin": null,
-            "max_height": null,
-            "max_width": null,
-            "min_height": null,
-            "min_width": null,
-            "object_fit": null,
-            "object_position": null,
-            "order": null,
-            "overflow": null,
-            "overflow_x": null,
-            "overflow_y": null,
-            "padding": null,
-            "right": null,
-            "top": null,
-            "visibility": null,
-            "width": null
-          }
-        },
-        "fb644d47049f495397d0e60597c86ea3": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_3d0344a9cc744e369da1b6b7ea1b3be8",
-            "max": 165333397,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_c452ccbf47a44073aee710175f707a7d",
-            "value": 165333397
-          }
-        },
-        "fdefb51ad4c4418b98c5826126558011": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "DescriptionStyleModel",
-          "state": {
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "DescriptionStyleModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/base",
-            "_view_module_version": "1.2.0",
-            "_view_name": "StyleView",
-            "description_width": ""
-          }
-        },
-        "fe7553b513954cc68c427b5d9d260b33": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "FloatProgressModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "FloatProgressModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "ProgressView",
-            "bar_style": "success",
-            "description": "",
-            "description_tooltip": null,
-            "layout": "IPY_MODEL_179d41b80dc841e8a440482516b8bca5",
-            "max": 461411018,
-            "min": 0,
-            "orientation": "horizontal",
-            "style": "IPY_MODEL_22b1ecd2eff14770bcfb0c62d3d4213f",
-            "value": 461411018
-          }
-        },
-        "feb82e061ee44283b4a46be858ef4cd7": {
-          "model_module": "@jupyter-widgets/controls",
-          "model_module_version": "1.5.0",
-          "model_name": "HBoxModel",
-          "state": {
-            "_dom_classes": [],
-            "_model_module": "@jupyter-widgets/controls",
-            "_model_module_version": "1.5.0",
-            "_model_name": "HBoxModel",
-            "_view_count": null,
-            "_view_module": "@jupyter-widgets/controls",
-            "_view_module_version": "1.5.0",
-            "_view_name": "HBoxView",
-            "box_style": "",
-            "children": [
-              "IPY_MODEL_78a2d2d4ee3f42f3be42ef4baa298561",
-              "IPY_MODEL_ba5e6ca09f174ef3a348453cf5cfc24a",
-              "IPY_MODEL_74b58e4647644c9daf9af488942fdaf4"
-            ],
-            "layout": "IPY_MODEL_d56e218958a041e286e80f24e400ab0b"
-          }
-        }
-      }
     }
   },
   "nbformat": 4,
diff --git a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
index 399a3bff1..e70cc3bbe 100644
--- a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
@@ -840,7 +840,6 @@
     "    \"memory_optimizations.rst\",\n",
     "    \"chat.rst\",\n",
     "    \"llama3.rst\",\n",
-    "    \"datasets.rst\",\n",
     "    \"qat_finetune.rst\",\n",
     "    \"lora_finetune.rst\",\n",
     "]\n",
@@ -1586,7 +1585,6 @@
     "    \"memory_optimizations.rst\",\n",
     "    \"chat.rst\",\n",
     "    \"llama3.rst\",\n",
-    "    \"datasets.rst\",\n",
     "    \"qat_finetune.rst\",\n",
     "    \"lora_finetune.rst\",\n",
     "]\n",
diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index caa4f17ff..9fc375175 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -44,7 +44,7 @@ def main(output_dir: str):
     if return_type_errors:
         print("\nAPI Method Return Type Validation Errors:\n")
         for error in return_type_errors:
-            print(error)
+            print(error, file=sys.stderr)
         sys.exit(1)
     now = str(datetime.now())
     print(
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index 3936bb3c4..5b7a685c1 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -6,6 +6,7 @@
 
 import hashlib
 import ipaddress
+import types
 import typing
 from dataclasses import make_dataclass
 from typing import Any, Dict, Set, Union
@@ -179,7 +180,7 @@ class ContentBuilder:
         "Creates the content subtree for a request or response."
 
         def is_iterator_type(t):
-            return "StreamChunk" in str(t)
+            return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)
 
         def get_media_type(t):
             if is_generic_list(t):
@@ -189,7 +190,7 @@ class ContentBuilder:
             else:
                 return "application/json"
 
-        if typing.get_origin(payload_type) is typing.Union:
+        if typing.get_origin(payload_type) in (typing.Union, types.UnionType):
             media_types = []
             item_types = []
             for x in typing.get_args(payload_type):
@@ -758,7 +759,7 @@ class Generator:
         )
 
         return Operation(
-            tags=[op.defining_class.__name__],
+            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
             summary=None,
             # summary=doc_string.short_description,
             description=description,
@@ -804,6 +805,8 @@ class Generator:
         operation_tags: List[Tag] = []
         for cls in endpoint_classes:
             doc_string = parse_type(cls)
+            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
+                continue
             operation_tags.append(
                 Tag(
                     name=cls.__name__,
diff --git a/docs/openapi_generator/pyopenapi/utility.py b/docs/openapi_generator/pyopenapi/utility.py
index db18e8430..12a69050c 100644
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@@ -174,14 +174,64 @@ def _validate_list_parameters_contain_data(method) -> str | None:
         return "does not have a mandatory data attribute containing the list of objects"
 
 
+def _validate_has_ellipsis(method) -> str | None:
+    source = inspect.getsource(method)
+    if "..." not in source and not "NotImplementedError" in source:
+        return "does not contain ellipsis (...) in its implementation"
+
+def _validate_has_return_in_docstring(method) -> str | None:
+    source = inspect.getsource(method)
+    return_type = method.__annotations__.get('return')
+    if return_type is not None and return_type != type(None) and ":returns:" not in source:
+        return "does not have a ':returns:' in its docstring"
+
+def _validate_has_params_in_docstring(method) -> str | None:
+    source = inspect.getsource(method)
+    sig = inspect.signature(method)
+    # Only check if the method has more than one parameter
+    if len(sig.parameters) > 1 and ":param" not in source:
+        return "does not have a ':param' in its docstring"
+
+def _validate_has_no_return_none_in_docstring(method) -> str | None:
+    source = inspect.getsource(method)
+    return_type = method.__annotations__.get('return')
+    if return_type is None and ":returns: None" in source:
+        return "has a ':returns: None' in its docstring which is redundant for None-returning functions"
+
+def _validate_docstring_lines_end_with_dot(method) -> str | None:
+    docstring = inspect.getdoc(method)
+    if docstring is None:
+        return None
+
+    lines = docstring.split('\n')
+    for line in lines:
+        line = line.strip()
+        if line and not any(line.endswith(char) for char in '.:{}[]()",'):
+            return f"docstring line '{line}' does not end with a valid character: . : {{ }} [ ] ( ) , \""
+
 _VALIDATORS = {
     "GET": [
         _validate_api_method_return_type,
         _validate_list_parameters_contain_data,
         _validate_api_method_doesnt_return_list,
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_docstring_lines_end_with_dot,
     ],
     "DELETE": [
         _validate_api_delete_method_returns_none,
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_has_no_return_none_in_docstring
+    ],
+    "POST": [
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_has_no_return_none_in_docstring,
+        _validate_docstring_lines_end_with_dot,
     ],
 }
 
diff --git a/docs/readme.md b/docs/readme.md
index b88a4738d..c238c4720 100644
--- a/docs/readme.md
+++ b/docs/readme.md
@@ -3,10 +3,10 @@
 Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
 
 ## Render locally
+
+From the llama-stack root directory, run the following command to render the docs locally:
 ```bash
-pip install -r requirements.txt
-cd docs
-python -m sphinx_autobuild source _build
+uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
 ```
 You can open up the docs in your browser at http://localhost:8000
 
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index e31d08ff1..000000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-sphinx==8.1.3
-myst-parser
-linkify
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
-sphinx-rtd-theme>=1.0.0
-sphinx_autobuild
-sphinx-copybutton
-sphinx-design
-sphinx-pdj-theme
-sphinx_rtd_dark_mode
-sphinx-tabs
-sphinxcontrib-openapi
-sphinxcontrib-redoc
-sphinxcontrib-mermaid
-sphinxcontrib-video
-tomli
diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md
index db6303209..289c38991 100644
--- a/docs/source/building_applications/rag.md
+++ b/docs/source/building_applications/rag.md
@@ -51,11 +51,37 @@ chunks = [
         "mime_type": "text/plain",
         "metadata": {
             "document_id": "doc1",
+            "author": "Jane Doe",
         },
     },
 ]
 client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
 ```
+
+#### Using Precomputed Embeddings
+If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by
+including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you
+want to customize the ingestion process.
+```python
+chunks_with_embeddings = [
+    {
+        "content": "First chunk of text",
+        "mime_type": "text/plain",
+        "embedding": [0.1, 0.2, 0.3, ...],  # Your precomputed embedding vector
+        "metadata": {"document_id": "doc1", "section": "introduction"},
+    },
+    {
+        "content": "Second chunk of text",
+        "mime_type": "text/plain",
+        "embedding": [0.2, 0.3, 0.4, ...],  # Your precomputed embedding vector
+        "metadata": {"document_id": "doc1", "section": "methodology"},
+    },
+]
+client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
+```
+When providing precomputed embeddings, ensure the length of each embedding vector matches the `embedding_dimension`
+specified when registering the vector database.
+
 ### Retrieval
 You can query the vector database to retrieve documents based on their embeddings.
 ```python
@@ -98,6 +124,17 @@ results = client.tool_runtime.rag_tool.query(
 )
 ```
 
+If it is useful for your application, you can configure how the RAG tool adds chunk metadata to the context by setting `chunk_template` in `query_config`:
+```python
+# Query documents
+results = client.tool_runtime.rag_tool.query(
+    vector_db_ids=[vector_db_id],
+    content="What do you know about...",
+    query_config={
+        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
+    },
+)
+```
 ### Building RAG-Enhanced Agents
 
 One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
@@ -115,6 +152,12 @@ agent = Agent(
             "name": "builtin::rag/knowledge_search",
             "args": {
                 "vector_db_ids": [vector_db_id],
+                # Defaults
+                "query_config": {
+                    "chunk_size_in_tokens": 512,
+                    "chunk_overlap_in_tokens": 0,
+                    "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
+                },
             },
         }
     ],
diff --git a/docs/source/building_applications/tools.md b/docs/source/building_applications/tools.md
index 6da1c5a6a..c7af17bfa 100644
--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@@ -43,27 +43,6 @@ The tool requires an API key which can be provided either in the configuration o
 
 > **NOTE:** When using Tavily Search and Bing Search, the inference output will still display "Brave Search." This is because Llama models have been trained with Brave Search as a built-in tool. Tavily and bing is just being used in lieu of Brave search.
 
-#### Code Interpreter
-
-The Code Interpreter allows execution of Python code within a controlled environment.
-
-```python
-# Register Code Interpreter tool group
-client.toolgroups.register(
-    toolgroup_id="builtin::code_interpreter", provider_id="code_interpreter"
-)
-```
-
-Features:
-- Secure execution environment using `bwrap` sandboxing
-- Matplotlib support for generating plots
-- Disabled dangerous system operations
-- Configurable execution timeouts
-
-> ⚠️ Important: The code interpreter tool can operate in a controlled environment locally or on Podman containers. To ensure proper functionality in containerized environments:
-> - The container requires privileged access (e.g., --privileged).
-> - Users without sufficient permissions may encounter permission errors. (`bwrap: Can't mount devpts on /newroot/dev/pts: Permission denied`)
-> - 🔒 Security Warning: Privileged mode grants elevated access and bypasses security restrictions. Use only in local, isolated, or controlled environments.
 
 #### WolframAlpha
 
@@ -102,7 +81,7 @@ Features:
 - Context retrieval with token limits
 
 
-> **Note:** By default, llama stack run.yaml defines toolgroups for web search, code interpreter and rag, that are provided by tavily-search, code-interpreter and rag providers.
+> **Note:** By default, the Llama Stack run.yaml defines toolgroups for web search, Wolfram Alpha, and RAG, which are provided by the tavily-search, wolfram-alpha, and rag providers.
 
 ## Model Context Protocol (MCP) Tools
 
@@ -186,34 +165,6 @@ all_tools = client.tools.list_tools()
 group_tools = client.tools.list_tools(toolgroup_id="search_tools")
 ```
 
-## Simple Example: Using an Agent with the Code-Interpreter Tool
-
-```python
-from llama_stack_client import Agent
-
-# Instantiate the AI agent with the given configuration
-agent = Agent(
-    client,
-    name="code-interpreter",
-    description="A code interpreter agent for executing Python code snippets",
-    instructions="""
-    You are a highly reliable, concise, and precise assistant.
-    Always show the generated code, never generate your own code, and never anticipate results.
-    """,
-    model="meta-llama/Llama-3.2-3B-Instruct",
-    tools=["builtin::code_interpreter"],
-    max_infer_iters=5,
-)
-
-# Start a session
-session_id = agent.create_session("tool_session")
-
-# Send a query to the AI agent for code execution
-response = agent.create_turn(
-    messages=[{"role": "user", "content": "Run this code: print(3 ** 4 - 5 * 2)"}],
-    session_id=session_id,
-)
-```
 ## Simple Example 2: Using an Agent with the Web Search Tool
 1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
 2. [Optional] Provide the API key directly to the Llama Stack server
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 55c6383b2..6e59dbdfb 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,7 +22,11 @@ from docutils import nodes
 # Read version from pyproject.toml
 with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
     pypi_url = "https://pypi.org/pypi/llama-stack/json"
-    version_tag = json.loads(requests.get(pypi_url).text)["info"]["version"]
+    headers = {
+        'User-Agent': 'pip/23.0.1 (python 3.11)',  # Mimic pip's user agent
+        'Accept': 'application/json'
+    }
+    version_tag = json.loads(requests.get(pypi_url, headers=headers).text)["info"]["version"]
     print(f"{version_tag=}")
 
     # generate the full link including text and url here
@@ -53,14 +57,6 @@ myst_enable_extensions = ["colon_fence"]
 
 html_theme = "sphinx_rtd_theme"
 html_use_relative_paths = True
-
-# html_theme = "sphinx_pdj_theme"
-# html_theme_path = [sphinx_pdj_theme.get_html_theme_path()]
-
-# html_theme = "pytorch_sphinx_theme"
-# html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
-
-
 templates_path = ["_templates"]
 exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
@@ -110,6 +106,8 @@ html_theme_options = {
     "canonical_url": "https://github.com/meta-llama/llama-stack",
     "collapse_navigation": False,
     # "style_nav_header_background": "#c3c9d4",
+    'display_version': True,
+    'version_selector': True,
 }
 
 default_dark_mode = False
diff --git a/docs/source/contributing/new_api_provider.md b/docs/source/contributing/new_api_provider.md
index c412a350b..83058896a 100644
--- a/docs/source/contributing/new_api_provider.md
+++ b/docs/source/contributing/new_api_provider.md
@@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
-- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
+- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
 
 
 Here are some example PRs to help you get started:
@@ -33,6 +33,7 @@ Note that each provider's `sample_run_config()` method (in the configuration cla
 
 Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
 
+Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually.
 
 ### 3. Additional end-to-end testing
 
diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md
index 56b8d30a8..0dbabf8aa 100644
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@@ -178,7 +178,7 @@ image_name: ollama
 image_type: conda
 
 # If some providers are external, you can specify the path to the implementation
-external_providers_dir: /etc/llama-stack/providers.d
+external_providers_dir: ~/.llama/providers.d
 ```
 
 ```
@@ -206,7 +206,7 @@ distribution_spec:
 image_type: container
 image_name: ci-test
 # Path to external provider implementations
-external_providers_dir: /etc/llama-stack/providers.d
+external_providers_dir: ~/.llama/providers.d
 ```
 
 Here's an example for a custom Ollama provider:
@@ -271,7 +271,7 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
 
 ```
 llama stack run -h
-usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
+usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
                        [--image-type {conda,container,venv}]
                        config
 
@@ -285,7 +285,6 @@ options:
   --port PORT           Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
   --image-name IMAGE_NAME
                         Name of the image to run. Defaults to the current environment (default: None)
-  --disable-ipv6        Disable IPv6 support (default: False)
   --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: [])
   --tls-keyfile TLS_KEYFILE
                         Path to TLS key file for HTTPS (default: None)
@@ -339,6 +338,48 @@ INFO:     Application startup complete.
 INFO:     Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
 INFO:     2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
 ```
+### Listing Distributions
+Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
+
+```
+llama stack list -h
+usage: llama stack list [-h]
+
+list the build stacks
+
+options:
+  -h, --help  show this help message and exit
+```
+
+Example Usage
+
+```
+llama stack list
+```
+
+### Removing a Distribution
+Use the `rm` command to delete a distribution you've previously built.
+
+```
+llama stack rm -h
+usage: llama stack rm [-h] [--all] [name]
+
+Remove the build stack
+
+positional arguments:
+  name        Name of the stack to delete (default: None)
+
+options:
+  -h, --help  show this help message and exit
+  --all, -a   Delete all stacks (use with caution) (default: False)
+```
+
+Example
+```
+llama stack rm llamastack-test
+```
+
+To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they’re no longer needed.
 
 ### Troubleshooting
 
diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md
index c06632991..de99b6576 100644
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@@ -53,6 +53,13 @@ models:
   provider_id: ollama
   provider_model_id: null
 shields: []
+server:
+  port: 8321
+  auth:
+    provider_type: "kubernetes"
+    config:
+      api_server_url: "https://kubernetes.default.svc"
+      ca_cert_path: "/path/to/ca.crt"
 ```
 
 Let's break this down into the different sections. The first section specifies the set of APIs that the stack server will serve:
@@ -102,6 +109,227 @@ A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and i
 
 What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
 
+## Server Configuration
+
+The `server` section configures the HTTP server that serves the Llama Stack APIs:
+
+```yaml
+server:
+  port: 8321  # Port to listen on (default: 8321)
+  tls_certfile: "/path/to/cert.pem"  # Optional: Path to TLS certificate for HTTPS
+  tls_keyfile: "/path/to/key.pem"    # Optional: Path to TLS key for HTTPS
+```
+
+### Authentication Configuration
+
+The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:
+
+```
+Authorization: Bearer <token>
+```
+
+The server supports multiple authentication providers:
+
+#### OAuth 2.0/OpenID Connect Provider with Kubernetes
+
+The Kubernetes cluster must be configured to use a service account for authentication.
+
+```bash
+kubectl create namespace llama-stack
+kubectl create serviceaccount llama-stack-auth -n llama-stack
+kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
+kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
+```
+
+Make sure the `kube-apiserver` runs with `--anonymous-auth=true` to allow unauthenticated requests
+and that the correct RoleBinding is created to allow the service account to access the necessary
+resources. If anonymous requests cannot read the OIDC signing keys, you can create a ClusterRole and
+ClusterRoleBinding that allow the `system:anonymous` user to fetch `/openid/v1/jwks`:
+
+```yaml
+# allow-anonymous-openid.yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: allow-anonymous-openid
+rules:
+- nonResourceURLs: ["/openid/v1/jwks"]
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: allow-anonymous-openid
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: allow-anonymous-openid
+subjects:
+- kind: User
+  name: system:anonymous
+  apiGroup: rbac.authorization.k8s.io
+```
+
+And then apply the configuration:
+```bash
+kubectl apply -f allow-anonymous-openid.yaml
+```
+
+The following configuration validates tokens against the Kubernetes API server through its OIDC provider:
+```yaml
+server:
+  auth:
+    provider_type: "oauth2_token"
+    config:
+      jwks:
+        uri: "https://kubernetes.default.svc"
+        key_recheck_period: 3600
+      tls_cafile: "/path/to/ca.crt"
+      issuer: "https://kubernetes.default.svc"
+      audience: "https://kubernetes.default.svc"
+```
+
+To find your cluster's audience, run:
+```bash
+kubectl create token default --duration=1h | cut -d. -f2 | base64 -d | jq .aud
+```
+
+For the issuer, you can use the OIDC provider's URL:
+```bash
+kubectl get --raw /.well-known/openid-configuration | jq .issuer
+```
+
+For the `tls_cafile`, you can use the CA certificate of the OIDC provider:
+```bash
+kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}'
+```
+
+The provider extracts user information from the JWT token:
+- Username from the `sub` claim becomes a role
+- Kubernetes groups become teams
+
+You can easily validate a request by running:
+
+```bash
+curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
+```
+
+#### Custom Provider
+Validates tokens against a custom authentication endpoint:
+```yaml
+server:
+  auth:
+    provider_type: "custom"
+    config:
+      endpoint: "https://auth.example.com/validate"  # URL of the auth endpoint
+```
+
+The custom endpoint receives a POST request with the following JSON body:
+```json
+{
+  "api_key": "<token>",
+  "request": {
+    "path": "/api/v1/endpoint",
+    "headers": {
+      "content-type": "application/json",
+      "user-agent": "curl/7.64.1"
+    },
+    "params": {
+      "key": ["value"]
+    }
+  }
+}
+```
+
+And must respond with:
+```json
+{
+  "access_attributes": {
+    "roles": ["admin", "user"],
+    "teams": ["ml-team", "nlp-team"],
+    "projects": ["llama-3", "project-x"],
+    "namespaces": ["research"]
+  },
+  "message": "Authentication successful"
+}
+```
+
+If no access attributes are returned, the token is used as a namespace.
+
+### Quota Configuration
+
+The `quota` section allows you to enable server-side request throttling for both
+authenticated and anonymous clients. This is useful for preventing abuse, enforcing
+fairness across tenants, and controlling infrastructure costs without requiring
+client-side rate limiting or external proxies.
+
+Quotas are disabled by default. When enabled, each client is tracked using either:
+
+* Their authenticated `client_id` (derived from the Bearer token), or
+* Their IP address (fallback for anonymous requests)
+
+Quota state is stored in a SQLite-backed key-value store, and rate limits are applied
+within a configurable time window (currently only `day` is supported).
+
+#### Example
+
+```yaml
+server:
+  quota:
+    kvstore:
+      type: sqlite
+      db_path: ./quotas.db
+    anonymous_max_requests: 100
+    authenticated_max_requests: 1000
+    period: day
+```
+
+#### Configuration Options
+
+| Field                        | Description                                                                |
+| ---------------------------- | -------------------------------------------------------------------------- |
+| `kvstore`                    | Required. Backend storage config for tracking request counts.              |
+| `kvstore.type`               | Must be `"sqlite"` for now. Other backends may be supported in the future. |
+| `kvstore.db_path`            | File path to the SQLite database.                                          |
+| `anonymous_max_requests`     | Max requests per period for unauthenticated clients.                       |
+| `authenticated_max_requests` | Max requests per period for authenticated clients.                         |
+| `period`                     | Time window for quota enforcement. Only `"day"` is supported.              |
+
+> Note: if `authenticated_max_requests` is set but no authentication provider is
+configured, the server will fall back to applying `anonymous_max_requests` to all
+clients.
+
+#### Example with Authentication Enabled
+
+```yaml
+server:
+  port: 8321
+  auth:
+    provider_type: custom
+    config:
+      endpoint: https://auth.example.com/validate
+  quota:
+    kvstore:
+      type: sqlite
+      db_path: ./quotas.db
+    anonymous_max_requests: 100
+    authenticated_max_requests: 1000
+    period: day
+```
+
+If a client exceeds their limit, the server responds with:
+
+```http
+HTTP/1.1 429 Too Many Requests
+Content-Type: application/json
+
+{
+  "error": {
+    "message": "Quota exceeded"
+  }
+}
+```
+
 ## Extending to handle Safety
 
 Configuring Safety can be a little involved so it is instructive to go through an example.
diff --git a/docs/source/distributions/kubernetes_deployment.md b/docs/source/distributions/kubernetes_deployment.md
index 21ec02012..f43039824 100644
--- a/docs/source/distributions/kubernetes_deployment.md
+++ b/docs/source/distributions/kubernetes_deployment.md
@@ -172,7 +172,7 @@ spec:
       - name: llama-stack
         image: localhost/llama-stack-run-k8s:latest
         imagePullPolicy: IfNotPresent
-        command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]
+        command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/app/config.yaml"]
         ports:
           - containerPort: 5000
         volumeMounts:
diff --git a/docs/source/distributions/remote_hosted_distro/watsonx.md b/docs/source/distributions/remote_hosted_distro/watsonx.md
index 018dc2a3c..ec1b98059 100644
--- a/docs/source/distributions/remote_hosted_distro/watsonx.md
+++ b/docs/source/distributions/remote_hosted_distro/watsonx.md
@@ -18,11 +18,11 @@ The `llamastack/distribution-watsonx` distribution consists of the following pro
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::watsonx` |
+| inference | `remote::watsonx`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
 | vector_io | `inline::faiss` |
 
 
@@ -70,7 +70,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-watsonx \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env WATSONX_API_KEY=$WATSONX_API_KEY \
   --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md
index 302d6932b..d7aedbfb2 100644
--- a/docs/source/distributions/self_hosted_distro/bedrock.md
+++ b/docs/source/distributions/self_hosted_distro/bedrock.md
@@ -19,7 +19,7 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro
 | safety | `remote::bedrock` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
diff --git a/docs/source/distributions/self_hosted_distro/cerebras.md b/docs/source/distributions/self_hosted_distro/cerebras.md
index 8f441823a..3c4db1b75 100644
--- a/docs/source/distributions/self_hosted_distro/cerebras.md
+++ b/docs/source/distributions/self_hosted_distro/cerebras.md
@@ -12,7 +12,7 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
@@ -52,7 +52,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-cerebras \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
 ```
diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md
index 96b0ef478..eded3bdc4 100644
--- a/docs/source/distributions/self_hosted_distro/dell.md
+++ b/docs/source/distributions/self_hosted_distro/dell.md
@@ -23,7 +23,7 @@ The `llamastack/distribution-dell` distribution consists of the following provid
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
@@ -155,7 +155,7 @@ docker run \
   -v $HOME/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-dell \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env DEH_URL=$DEH_URL \
diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md
index ee9ddc818..d36e94748 100644
--- a/docs/source/distributions/self_hosted_distro/fireworks.md
+++ b/docs/source/distributions/self_hosted_distro/fireworks.md
@@ -22,7 +22,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
diff --git a/docs/source/distributions/self_hosted_distro/groq.md b/docs/source/distributions/self_hosted_distro/groq.md
index b18be1b2f..1b2194ad8 100644
--- a/docs/source/distributions/self_hosted_distro/groq.md
+++ b/docs/source/distributions/self_hosted_distro/groq.md
@@ -22,7 +22,7 @@ The `llamastack/distribution-groq` distribution consists of the following provid
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
 | vector_io | `inline::faiss` |
 
 
diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
index f58d7bbee..8b9dcec55 100644
--- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
@@ -22,7 +22,7 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md
index 4407de779..e84b5c525 100644
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@@ -6,7 +6,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
-| datasetio | `inline::localfs` |
+| datasetio | `inline::localfs`, `remote::nvidia` |
 | eval | `remote::nvidia` |
 | inference | `remote::nvidia` |
 | post_training | `remote::nvidia` |
@@ -143,7 +143,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-nvidia \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
 ```
diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md
index 2358a52a7..4d148feda 100644
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@@ -19,10 +19,11 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::ollama` |
+| post_training | `inline::huggingface` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
@@ -97,7 +98,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-ollama \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env SAFETY_MODEL=$SAFETY_MODEL \
diff --git a/docs/source/distributions/self_hosted_distro/passthrough.md b/docs/source/distributions/self_hosted_distro/passthrough.md
index 04fc9d927..39f076be4 100644
--- a/docs/source/distributions/self_hosted_distro/passthrough.md
+++ b/docs/source/distributions/self_hosted_distro/passthrough.md
@@ -22,7 +22,7 @@ The `llamastack/distribution-passthrough` distribution consists of the following
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md
index 46df56008..6e7cf410d 100644
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@@ -21,7 +21,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
@@ -233,7 +233,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
   llamastack/distribution-remote-vllm \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1
@@ -255,7 +255,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-remote-vllm \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \
diff --git a/docs/source/distributions/self_hosted_distro/sambanova.md b/docs/source/distributions/self_hosted_distro/sambanova.md
index 76b976d78..bb4842362 100644
--- a/docs/source/distributions/self_hosted_distro/sambanova.md
+++ b/docs/source/distributions/self_hosted_distro/sambanova.md
@@ -16,10 +16,10 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
-| inference | `remote::sambanova` |
-| safety | `inline::llama-guard` |
+| inference | `remote::sambanova`, `inline::sentence-transformers` |
+| safety | `remote::sambanova` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
@@ -28,53 +28,64 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
 The following environment variables can be configured:
 
 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
-- `SAMBANOVA_API_KEY`: SambaNova.AI API Key (default: ``)
+- `SAMBANOVA_API_KEY`: SambaNova API Key (default: ``)
 
 ### Models
 
 The following models are available by default:
 
-- `Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
-- `Meta-Llama-3.1-70B-Instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
-- `Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
-- `Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
-- `Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
-- `Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
-- `Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
-- `Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
-- `Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
-- `Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
+- `sambanova/Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
+- `sambanova/Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
+- `sambanova/Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
+- `sambanova/Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
+- `sambanova/Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
+- `sambanova/Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
+- `sambanova/Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
+- `sambanova/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
+- `sambanova/Llama-4-Maverick-17B-128E-Instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
+- `sambanova/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
 
 
 ### Prerequisite: API Keys
 
-Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://sambanova.ai/).
+Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://cloud.sambanova.ai?utm_source=llamastack&utm_medium=external&utm_campaign=cloud_signup).
 
 
 ## Running Llama Stack with SambaNova
 
 You can do this via Conda (build code) or Docker which has a pre-built image.
 
-### Via Docker
 
-This method allows you to get started quickly without having to build the distribution code.
+### Via Docker
 
 ```bash
 LLAMA_STACK_PORT=8321
+llama stack build --template sambanova --image-type container
 docker run \
   -it \
-  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-sambanova \
+  -v ~/.llama:/root/.llama \
+  distribution-sambanova \
   --port $LLAMA_STACK_PORT \
   --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
 ```
 
+
+### Via Venv
+
+```bash
+llama stack build --template sambanova --image-type venv
+llama stack run --image-type venv ~/.llama/distributions/sambanova/sambanova-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
+```
+
+
 ### Via Conda
 
 ```bash
 llama stack build --template sambanova --image-type conda
-llama stack run ./run.yaml \
+llama stack run --image-type conda ~/.llama/distributions/sambanova/sambanova-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
 ```
diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md
index f6b14b064..24f9d03ec 100644
--- a/docs/source/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/distributions/self_hosted_distro/tgi.md
@@ -23,7 +23,7 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
@@ -117,7 +117,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-tgi \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md
index 3ebb1f59e..adfc2c472 100644
--- a/docs/source/distributions/self_hosted_distro/together.md
+++ b/docs/source/distributions/self_hosted_distro/together.md
@@ -22,7 +22,7 @@ The `llamastack/distribution-together` distribution consists of the following pr
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
diff --git a/docs/source/getting_started/detailed_tutorial.md b/docs/source/getting_started/detailed_tutorial.md
index a1504f249..e40a4903a 100644
--- a/docs/source/getting_started/detailed_tutorial.md
+++ b/docs/source/getting_started/detailed_tutorial.md
@@ -42,7 +42,7 @@ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | ie
 Setup your virtual environment.
 
 ```bash
-uv venv --python 3.10
+uv sync --python 3.10
 source .venv/bin/activate
 ```
 ## Step 2:  Run Llama Stack
@@ -445,7 +445,6 @@ from llama_stack_client import LlamaStackClient
 from llama_stack_client import Agent, AgentEventLogger
 from llama_stack_client.types import Document
 import uuid
-from termcolor import cprint
 
 client = LlamaStackClient(base_url="http://localhost:8321")
 
@@ -463,7 +462,6 @@ urls = [
     "memory_optimizations.rst",
     "chat.rst",
     "llama3.rst",
-    "datasets.rst",
     "qat_finetune.rst",
     "lora_finetune.rst",
 ]
diff --git a/docs/source/providers/external.md b/docs/source/providers/external.md
index 5aab5ee0f..55211ac5f 100644
--- a/docs/source/providers/external.md
+++ b/docs/source/providers/external.md
@@ -10,7 +10,7 @@ Llama Stack supports external providers that live outside of the main codebase.
 To enable external providers, you need to configure the `external_providers_dir` in your Llama Stack configuration. This directory should contain your external provider specifications:
 
 ```yaml
-external_providers_dir: /etc/llama-stack/providers.d/
+external_providers_dir: ~/.llama/providers.d/
 ```
 
 ## Directory Structure
@@ -53,7 +53,9 @@ Here's a list of known external providers that you can use with Llama Stack:
 | Name | Description | API | Type | Repository |
 |------|-------------|-----|------|------------|
 | KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
+| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
 | RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
+| TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
 
 ### Remote Provider Specification
 
@@ -180,7 +182,7 @@ dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
 3. Create the provider specification:
 
 ```yaml
-# /etc/llama-stack/providers.d/remote/inference/custom_ollama.yaml
+# ~/.llama/providers.d/remote/inference/custom_ollama.yaml
 adapter:
   adapter_type: custom_ollama
   pip_packages: ["ollama", "aiohttp"]
@@ -199,7 +201,7 @@ uv pip install -e .
 5. Configure Llama Stack to use external providers:
 
 ```yaml
-external_providers_dir: /etc/llama-stack/providers.d/
+external_providers_dir: ~/.llama/providers.d/
 ```
 
 The provider will now be available in Llama Stack with the type `remote::custom_ollama`.
diff --git a/docs/source/providers/index.md b/docs/source/providers/index.md
index 1d1a6e081..1f5026479 100644
--- a/docs/source/providers/index.md
+++ b/docs/source/providers/index.md
@@ -30,6 +30,18 @@ Runs inference with an LLM.
 ## Post Training
 Fine-tunes a model.
 
+#### Post Training Providers
+The following providers are available for Post Training:
+
+```{toctree}
+:maxdepth: 1
+
+external
+post_training/huggingface
+post_training/torchtune
+post_training/nvidia_nemo
+```
+
 ## Safety
 Applies safety policies to the output at a Systems (not only model) level.
 
diff --git a/docs/source/providers/post_training/huggingface.md b/docs/source/providers/post_training/huggingface.md
new file mode 100644
index 000000000..c342203a8
--- /dev/null
+++ b/docs/source/providers/post_training/huggingface.md
@@ -0,0 +1,122 @@
+---
+orphan: true
+---
+# HuggingFace SFTTrainer
+
+[HuggingFace SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer) is an inline post training provider for Llama Stack. It allows you to run supervised fine tuning on a variety of models using many datasets.
+
+## Features
+
+- Simple access through the post_training API
+- Fully integrated with Llama Stack
+- GPU support, CPU support, and MPS support (MacOS Metal Performance Shaders)
+
+## Usage
+
+To use the HF SFTTrainer in your Llama Stack project, follow these steps:
+
+1. Configure your Llama Stack project to use this provider.
+2. Kick off a SFT job using the Llama Stack post_training API.
+
+## Setup
+
+You can access the HuggingFace trainer via the `ollama` distribution:
+
+```bash
+llama stack build --template ollama --image-type venv
+llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml
+```
+
+## Run Training
+
+You can access the provider and the `supervised_fine_tune` method via the post_training API:
+
+```python
+import time
+import uuid
+
+
+from llama_stack_client.types import (
+    post_training_supervised_fine_tune_params,
+    algorithm_config_param,
+)
+
+
+def create_http_client():
+    from llama_stack_client import LlamaStackClient
+
+    return LlamaStackClient(base_url="http://localhost:8321")
+
+
+client = create_http_client()
+
+# Example Dataset
+client.datasets.register(
+    purpose="post-training/messages",
+    source={
+        "type": "uri",
+        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
+    },
+    dataset_id="simpleqa",
+)
+
+training_config = post_training_supervised_fine_tune_params.TrainingConfig(
+    data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
+        batch_size=32,
+        data_format="instruct",
+        dataset_id="simpleqa",
+        shuffle=True,
+    ),
+    gradient_accumulation_steps=1,
+    max_steps_per_epoch=0,
+    max_validation_steps=1,
+    n_epochs=4,
+)
+
+algorithm_config = algorithm_config_param.LoraFinetuningConfig(  # this config is also currently mandatory but should not be
+    alpha=1,
+    apply_lora_to_mlp=True,
+    apply_lora_to_output=False,
+    lora_attn_modules=["q_proj"],
+    rank=1,
+    type="LoRA",
+)
+
+job_uuid = f"test-job{uuid.uuid4()}"
+
+# Example Model
+training_model = "ibm-granite/granite-3.3-8b-instruct"
+
+start_time = time.time()
+response = client.post_training.supervised_fine_tune(
+    job_uuid=job_uuid,
+    logger_config={},
+    model=training_model,
+    hyperparam_search_config={},
+    training_config=training_config,
+    algorithm_config=algorithm_config,
+    checkpoint_dir="output",
+)
+print("Job: ", job_uuid)
+
+
+# Wait for the job to complete!
+while True:
+    status = client.post_training.job.status(job_uuid=job_uuid)
+    if not status:
+        print("Job not found")
+        break
+
+    print(status)
+    if status.status == "completed":
+        break
+
+    print("Waiting for job to complete...")
+    time.sleep(5)
+
+end_time = time.time()
+print("Job completed in", end_time - start_time, "seconds!")
+
+print("Artifacts:")
+print(client.post_training.job.artifacts(job_uuid=job_uuid))
+```
diff --git a/docs/source/providers/post_training/nvidia_nemo.md b/docs/source/providers/post_training/nvidia_nemo.md
new file mode 100644
index 000000000..1a7adbe16
--- /dev/null
+++ b/docs/source/providers/post_training/nvidia_nemo.md
@@ -0,0 +1,163 @@
+---
+orphan: true
+---
+# NVIDIA NEMO
+
+[NVIDIA NEMO](https://developer.nvidia.com/nemo-framework) is a remote post training provider for Llama Stack. It provides enterprise-grade fine-tuning capabilities through NVIDIA's NeMo Customizer service.
+
+## Features
+
+- Enterprise-grade fine-tuning capabilities
+- Support for LoRA and SFT fine-tuning
+- Integration with NVIDIA's NeMo Customizer service
+- Support for various NVIDIA-optimized models
+- Efficient training with NVIDIA hardware acceleration
+
+## Usage
+
+To use NVIDIA NEMO in your Llama Stack project, follow these steps:
+
+1. Configure your Llama Stack project to use this provider.
+2. Set up your NVIDIA API credentials.
+3. Kick off a fine-tuning job using the Llama Stack post_training API.
+
+## Setup
+
+You'll need to set the following environment variables:
+
+```bash
+export NVIDIA_API_KEY="your-api-key"
+export NVIDIA_DATASET_NAMESPACE="default"
+export NVIDIA_CUSTOMIZER_URL="your-customizer-url"
+export NVIDIA_PROJECT_ID="your-project-id"
+export NVIDIA_OUTPUT_MODEL_DIR="your-output-model-dir"
+```
+
+## Run Training
+
+You can access the provider and the `supervised_fine_tune` method via the post_training API:
+
+```python
+import time
+import uuid
+
+from llama_stack_client.types import (
+    post_training_supervised_fine_tune_params,
+    algorithm_config_param,
+)
+
+
+def create_http_client():
+    from llama_stack_client import LlamaStackClient
+
+    return LlamaStackClient(base_url="http://localhost:8321")
+
+
+client = create_http_client()
+
+# Example Dataset
+client.datasets.register(
+    purpose="post-training/messages",
+    source={
+        "type": "uri",
+        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
+    },
+    dataset_id="simpleqa",
+)
+
+training_config = post_training_supervised_fine_tune_params.TrainingConfig(
+    data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
+        batch_size=8,  # Default batch size for NEMO
+        data_format="instruct",
+        dataset_id="simpleqa",
+        shuffle=True,
+    ),
+    n_epochs=50,  # Default epochs for NEMO
+    optimizer_config=post_training_supervised_fine_tune_params.TrainingConfigOptimizerConfig(
+        lr=0.0001,  # Default learning rate
+        weight_decay=0.01,  # NEMO-specific parameter
+    ),
+    # NEMO-specific parameters
+    log_every_n_steps=None,
+    val_check_interval=0.25,
+    sequence_packing_enabled=False,
+    hidden_dropout=None,
+    attention_dropout=None,
+    ffn_dropout=None,
+)
+
+algorithm_config = algorithm_config_param.LoraFinetuningConfig(
+    alpha=16,  # Default alpha for NEMO
+    type="LoRA",
+)
+
+job_uuid = f"test-job{uuid.uuid4()}"
+
+# Example Model - must be a supported NEMO model
+training_model = "meta/llama-3.1-8b-instruct"
+
+start_time = time.time()
+response = client.post_training.supervised_fine_tune(
+    job_uuid=job_uuid,
+    logger_config={},
+    model=training_model,
+    hyperparam_search_config={},
+    training_config=training_config,
+    algorithm_config=algorithm_config,
+    checkpoint_dir="output",
+)
+print("Job: ", job_uuid)
+
+# Wait for the job to complete!
+while True:
+    status = client.post_training.job.status(job_uuid=job_uuid)
+    if not status:
+        print("Job not found")
+        break
+
+    print(status)
+    if status.status == "completed":
+        break
+
+    print("Waiting for job to complete...")
+    time.sleep(5)
+
+end_time = time.time()
+print("Job completed in", end_time - start_time, "seconds!")
+
+print("Artifacts:")
+print(client.post_training.job.artifacts(job_uuid=job_uuid))
+```
+
+## Supported Models
+
+Currently supports the following models:
+- meta/llama-3.1-8b-instruct
+- meta/llama-3.2-1b-instruct
+
+## Supported Parameters
+
+### TrainingConfig
+- n_epochs (default: 50)
+- data_config
+- optimizer_config
+- log_every_n_steps
+- val_check_interval (default: 0.25)
+- sequence_packing_enabled (default: False)
+- hidden_dropout (0.0-1.0)
+- attention_dropout (0.0-1.0)
+- ffn_dropout (0.0-1.0)
+
+### DataConfig
+- dataset_id
+- batch_size (default: 8)
+
+### OptimizerConfig
+- lr (default: 0.0001)
+- weight_decay (default: 0.01)
+
+### LoRA Config
+- alpha (default: 16)
+- type (must be "LoRA")
+
+Note: Some parameters from the standard Llama Stack API are not supported and will be ignored with a warning.
diff --git a/docs/source/providers/post_training/torchtune.md b/docs/source/providers/post_training/torchtune.md
new file mode 100644
index 000000000..ef72505b1
--- /dev/null
+++ b/docs/source/providers/post_training/torchtune.md
@@ -0,0 +1,125 @@
+---
+orphan: true
+---
+# TorchTune
+
+[TorchTune](https://github.com/pytorch/torchtune) is an inline post training provider for Llama Stack. It provides a simple and efficient way to fine-tune language models using PyTorch.
+
+## Features
+
+- Simple access through the post_training API
+- Fully integrated with Llama Stack
+- GPU support and single device capabilities.
+- Support for LoRA
+
+## Usage
+
+To use TorchTune in your Llama Stack project, follow these steps:
+
+1. Configure your Llama Stack project to use this provider.
+2. Kick off a fine-tuning job using the Llama Stack post_training API.
+
+## Setup
+
+You can access the TorchTune trainer by writing your own yaml pointing to the provider:
+
+```yaml
+post_training:
+  - provider_id: torchtune
+    provider_type: inline::torchtune
+    config: {}
+```
+
+You can then build and run your own stack with this provider.
+
+## Run Training
+
+You can access the provider and the `supervised_fine_tune` method via the post_training API:
+
+```python
+import time
+import uuid
+
+from llama_stack_client.types import (
+    post_training_supervised_fine_tune_params,
+    algorithm_config_param,
+)
+
+
+def create_http_client():
+    from llama_stack_client import LlamaStackClient
+
+    return LlamaStackClient(base_url="http://localhost:8321")
+
+
+client = create_http_client()
+
+# Example Dataset
+client.datasets.register(
+    purpose="post-training/messages",
+    source={
+        "type": "uri",
+        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
+    },
+    dataset_id="simpleqa",
+)
+
+training_config = post_training_supervised_fine_tune_params.TrainingConfig(
+    data_config=post_training_supervised_fine_tune_params.TrainingConfigDataConfig(
+        batch_size=32,
+        data_format="instruct",
+        dataset_id="simpleqa",
+        shuffle=True,
+    ),
+    gradient_accumulation_steps=1,
+    max_steps_per_epoch=0,
+    max_validation_steps=1,
+    n_epochs=4,
+)
+
+algorithm_config = algorithm_config_param.LoraFinetuningConfig(
+    alpha=1,
+    apply_lora_to_mlp=True,
+    apply_lora_to_output=False,
+    lora_attn_modules=["q_proj"],
+    rank=1,
+    type="LoRA",
+)
+
+job_uuid = f"test-job{uuid.uuid4()}"
+
+# Example Model
+training_model = "meta-llama/Llama-2-7b-hf"
+
+start_time = time.time()
+response = client.post_training.supervised_fine_tune(
+    job_uuid=job_uuid,
+    logger_config={},
+    model=training_model,
+    hyperparam_search_config={},
+    training_config=training_config,
+    algorithm_config=algorithm_config,
+    checkpoint_dir="output",
+)
+print("Job: ", job_uuid)
+
+# Wait for the job to complete!
+while True:
+    status = client.post_training.job.status(job_uuid=job_uuid)
+    if not status:
+        print("Job not found")
+        break
+
+    print(status)
+    if status.status == "completed":
+        break
+
+    print("Waiting for job to complete...")
+    time.sleep(5)
+
+end_time = time.time()
+print("Job completed in", end_time - start_time, "seconds!")
+
+print("Artifacts:")
+print(client.post_training.job.artifacts(job_uuid=job_uuid))
+```
diff --git a/docs/source/providers/vector_io/milvus.md b/docs/source/providers/vector_io/milvus.md
new file mode 100644
index 000000000..e030c85f8
--- /dev/null
+++ b/docs/source/providers/vector_io/milvus.md
@@ -0,0 +1,107 @@
+---
+orphan: true
+---
+# Milvus
+
+[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
+allows you to store and query vectors directly within a Milvus database.
+That means you're not limited to storing vectors in memory or in a separate service.
+
+## Features
+
+- Easy to use
+- Fully integrated with Llama Stack
+
+## Usage
+
+To use Milvus in your Llama Stack project, follow these steps:
+
+1. Install the necessary dependencies.
+2. Configure your Llama Stack project to use Milvus.
+3. Start storing and querying vectors.
+
+## Installation
+
+You can install Milvus using pymilvus:
+
+```bash
+pip install pymilvus
+```
+
+## Configuration
+
+In Llama Stack, Milvus can be configured in two ways:
+- **Inline (Local) Configuration** - Uses Milvus-Lite for local storage
+- **Remote Configuration** - Connects to a remote Milvus server
+
+### Inline (Local) Configuration
+
+The simplest method is local configuration, which requires setting `db_path`, a path for locally storing Milvus-Lite files:
+
+```yaml
+vector_io:
+  - provider_id: milvus
+    provider_type: inline::milvus
+    config:
+      db_path: ~/.llama/distributions/together/milvus_store.db
+```
+
+### Remote Configuration
+
+Remote configuration is suitable for larger data storage requirements:
+
+#### Standard Remote Connection
+
+```yaml
+vector_io:
+  - provider_id: milvus
+    provider_type: remote::milvus
+    config:
+      uri: "http://<host>:<port>"
+      token: "<user>:<password>"
+```
+
+#### TLS-Enabled Remote Connection (One-way TLS)
+
+For connections to Milvus instances with one-way TLS enabled:
+
+```yaml
+vector_io:
+  - provider_id: milvus
+    provider_type: remote::milvus
+    config:
+      uri: "https://<host>:<port>"
+      token: "<user>:<password>"
+      secure: True
+      server_pem_path: "/path/to/server.pem"
+```
+
+#### Mutual TLS (mTLS) Remote Connection
+
+For connections to Milvus instances with mutual TLS (mTLS) enabled:
+
+```yaml
+vector_io:
+  - provider_id: milvus
+    provider_type: remote::milvus
+    config:
+      uri: "https://<host>:<port>"
+      token: "<user>:<password>"
+      secure: True
+      ca_pem_path: "/path/to/ca.pem"
+      client_pem_path: "/path/to/client.pem"
+      client_key_path: "/path/to/client.key"
+```
+
+#### Key Parameters for TLS Configuration
+
+- **`secure`**: Enables TLS encryption when set to `true`. Defaults to `false`.
+- **`server_pem_path`**: Path to the **server certificate** for verifying the server’s identity (used in one-way TLS).
+- **`ca_pem_path`**: Path to the **Certificate Authority (CA) certificate** for validating the server certificate (required in mTLS).
+- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
+- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
+
+## Documentation
+See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
+
+For more details on TLS configuration, refer to the [TLS setup guide](https://milvus.io/docs/tls.md).
diff --git a/docs/source/providers/vector_io/mivus.md b/docs/source/providers/vector_io/mivus.md
deleted file mode 100644
index 8d2f043d5..000000000
--- a/docs/source/providers/vector_io/mivus.md
+++ /dev/null
@@ -1,31 +0,0 @@
----
-orphan: true
----
-# Milvus
-
-[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
-allows you to store and query vectors directly within a Milvus database.
-That means you're not limited to storing vectors in memory or in a separate service.
-
-## Features
-
-- Easy to use
-- Fully integrated with Llama Stack
-
-## Usage
-
-To use Milvus in your Llama Stack project, follow these steps:
-
-1. Install the necessary dependencies.
-2. Configure your Llama Stack project to use Milvus.
-3. Start storing and querying vectors.
-
-## Installation
-
-You can install Milvus using pymilvus:
-
-```bash
-pip install pymilvus
-```
-## Documentation
-See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
diff --git a/docs/source/providers/vector_io/sqlite-vec.md b/docs/source/providers/vector_io/sqlite-vec.md
index 43d10c751..49ba659f7 100644
--- a/docs/source/providers/vector_io/sqlite-vec.md
+++ b/docs/source/providers/vector_io/sqlite-vec.md
@@ -66,6 +66,25 @@ To use sqlite-vec in your Llama Stack project, follow these steps:
 2. Configure your Llama Stack project to use SQLite-Vec.
 3. Start storing and querying vectors.
 
+## Supported Search Modes
+
+The sqlite-vec provider supports both vector-based and keyword-based (full-text) search modes.
+
+When using the RAGTool interface, you can specify the desired search behavior via the `mode` parameter in
+`RAGQueryConfig`. For example:
+
+```python
+from llama_stack.apis.tool_runtime.rag import RAGQueryConfig
+
+query_config = RAGQueryConfig(max_chunks=6, mode="vector")
+
+results = client.tool_runtime.rag_tool.query(
+    vector_db_ids=[vector_db_id],
+    content="what is torchtune",
+    query_config=query_config,
+)
+```
+
 ## Installation
 
 You can install SQLite-Vec using pip:
diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md
index 0b84027f0..cd4dd4cd7 100644
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@@ -253,8 +253,6 @@ llama-stack-client toolgroups list
 +---------------------------+------------------+------+---------------+
 | identifier                | provider_id      | args | mcp_endpoint  |
 +===========================+==================+======+===============+
-| builtin::code_interpreter | code-interpreter | None | None          |
-+---------------------------+------------------+------+---------------+
 | builtin::rag              | rag-runtime      | None | None          |
 +---------------------------+------------------+------+---------------+
 | builtin::websearch        | tavily-search    | None | None          |
diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md
index 9f756de26..96f9768de 100644
--- a/docs/zero_to_hero_guide/README.md
+++ b/docs/zero_to_hero_guide/README.md
@@ -86,11 +86,11 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
    llama stack build --template ollama --image-type conda
    ```
    **Expected Output:**
-   ```
+   ```bash
    ...
-   Build Successful! Next steps:
-   1. Set the environment variables: LLAMA_STACK_PORT, OLLAMA_URL, INFERENCE_MODEL, SAFETY_MODEL
-   2. `llama stack run /Users//.llama/distributions/llamastack-ollama/ollama-run.yaml
+   Build Successful!
+   You can find the newly-built template here: ~/.llama/distributions/ollama/ollama-run.yaml
+   You can run the new Llama Stack Distro via: llama stack run ~/.llama/distributions/ollama/ollama-run.yaml --image-type conda
    ```
 
 3. **Set the ENV variables by exporting them to the terminal**:
diff --git a/install.sh b/install.sh
new file mode 100755
index 000000000..e424925a6
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,206 @@
+#!/usr/bin/env bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+set -Eeuo pipefail
+
+PORT=8321
+OLLAMA_PORT=11434
+MODEL_ALIAS="llama3.2:3b"
+SERVER_IMAGE="llamastack/distribution-ollama:0.2.2"
+WAIT_TIMEOUT=300
+
+log(){ printf "\e[1;32m%s\e[0m\n" "$*"; }
+die(){ printf "\e[1;31m❌ %s\e[0m\n" "$*" >&2; exit 1; }
+
+wait_for_service() {
+  local url="$1"
+  local pattern="$2"
+  local timeout="$3"
+  local name="$4"
+  local start ts
+  log "⏳  Waiting for ${name}…"
+  start=$(date +%s)
+  while true; do
+    if curl --retry 5 --retry-delay 1 --retry-max-time "$timeout" --retry-all-errors --silent --fail "$url" 2>/dev/null | grep -q "$pattern"; then
+      break
+    fi
+    ts=$(date +%s)
+    if (( ts - start >= timeout )); then
+      return 1
+    fi
+    printf '.'
+    sleep 1
+  done
+  return 0
+}
+
+usage() {
+    cat << EOF
+📚 Llama-Stack Deployment Script
+
+Description:
+    This script sets up and deploys Llama-Stack with Ollama integration in containers.
+    It handles both Docker and Podman runtimes and includes automatic platform detection.
+
+Usage:
+    $(basename "$0") [OPTIONS]
+
+Options:
+    -p, --port PORT            Server port for Llama-Stack (default: ${PORT})
+    -o, --ollama-port PORT     Ollama service port (default: ${OLLAMA_PORT})
+    -m, --model MODEL          Model alias to use (default: ${MODEL_ALIAS})
+    -i, --image IMAGE          Server image (default: ${SERVER_IMAGE})
+    -t, --timeout SECONDS      Service wait timeout in seconds (default: ${WAIT_TIMEOUT})
+    -h, --help                 Show this help message
+
+For more information:
+    Documentation: https://llama-stack.readthedocs.io/
+    GitHub: https://github.com/meta-llama/llama-stack
+
+Report issues:
+    https://github.com/meta-llama/llama-stack/issues
+EOF
+}
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -p|--port)
+            PORT="$2"
+            shift 2
+            ;;
+        -o|--ollama-port)
+            OLLAMA_PORT="$2"
+            shift 2
+            ;;
+        -m|--model)
+            MODEL_ALIAS="$2"
+            shift 2
+            ;;
+        -i|--image)
+            SERVER_IMAGE="$2"
+            shift 2
+            ;;
+        -t|--timeout)
+            WAIT_TIMEOUT="$2"
+            shift 2
+            ;;
+        *)
+            die "Unknown option: $1"
+            ;;
+    esac
+done
+
+if command -v docker &> /dev/null; then
+  ENGINE="docker"
+elif command -v podman &> /dev/null; then
+  ENGINE="podman"
+else
+  die "Docker or Podman is required. Install Docker: https://docs.docker.com/get-docker/ or Podman: https://podman.io/getting-started/installation"
+fi
+
+# Explicitly set the platform for the host architecture
+HOST_ARCH="$(uname -m)"
+if [ "$HOST_ARCH" = "arm64" ]; then
+  if [ "$ENGINE" = "docker" ]; then
+    PLATFORM_OPTS=( --platform linux/amd64 )
+  else
+    PLATFORM_OPTS=( --os linux --arch amd64 )
+  fi
+else
+  PLATFORM_OPTS=()
+fi
+
+# macOS + Podman: ensure VM is running before we try to launch containers
+# If you need GPU passthrough under Podman on macOS, init the VM with libkrun:
+#   CONTAINERS_MACHINE_PROVIDER=libkrun podman machine init
+if [ "$ENGINE" = "podman" ] && [ "$(uname -s)" = "Darwin" ]; then
+  if ! podman info &>/dev/null; then
+    log "⌛️ Initializing Podman VM…"
+    podman machine init &>/dev/null || true
+    podman machine start &>/dev/null || true
+
+    log "⌛️  Waiting for Podman API…"
+    until podman info &>/dev/null; do
+      sleep 1
+    done
+    log "✅  Podman VM is up"
+  fi
+fi
+
+# Clean up any leftovers from earlier runs
+for name in ollama-server llama-stack; do
+  ids=$($ENGINE ps -aq --filter "name=^${name}$")
+  if [ -n "$ids" ]; then
+    log "⚠️   Found existing container(s) for '${name}', removing…"
+    $ENGINE rm -f "$ids" > /dev/null 2>&1
+  fi
+done
+
+###############################################################################
+# 0. Create a shared network
+###############################################################################
+if ! $ENGINE network inspect llama-net >/dev/null 2>&1; then
+  log "🌐  Creating network…"
+  $ENGINE network create llama-net >/dev/null 2>&1
+fi
+
+###############################################################################
+# 1. Ollama
+###############################################################################
+log "🦙  Starting Ollama…"
+$ENGINE run -d "${PLATFORM_OPTS[@]}" --name ollama-server \
+  --network llama-net \
+  -p "${OLLAMA_PORT}:${OLLAMA_PORT}" \
+  ollama/ollama > /dev/null 2>&1
+
+if ! wait_for_service "http://localhost:${OLLAMA_PORT}/" "Ollama" "$WAIT_TIMEOUT" "Ollama daemon"; then
+  log "❌  Ollama daemon did not become ready in ${WAIT_TIMEOUT}s; dumping container logs:"
+  $ENGINE logs --tail 200 ollama-server
+  die "Ollama startup failed"
+fi
+
+log "📦  Ensuring model is pulled: ${MODEL_ALIAS}…"
+if ! $ENGINE exec ollama-server ollama pull "${MODEL_ALIAS}" > /dev/null 2>&1; then
+  log "❌  Failed to pull model ${MODEL_ALIAS}; dumping container logs:"
+  $ENGINE logs --tail 200 ollama-server
+  die "Model pull failed"
+fi
+
+###############################################################################
+# 2. Llama‑Stack
+###############################################################################
+cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
+      --network llama-net \
+      -p "${PORT}:${PORT}" \
+      "${SERVER_IMAGE}" --port "${PORT}" \
+      --env INFERENCE_MODEL="${MODEL_ALIAS}" \
+      --env OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}" )
+
+log "🦙  Starting Llama‑Stack…"
+$ENGINE "${cmd[@]}" > /dev/null 2>&1
+
+if ! wait_for_service "http://127.0.0.1:${PORT}/v1/health" "OK" "$WAIT_TIMEOUT" "Llama-Stack API"; then
+  log "❌  Llama-Stack did not become ready in ${WAIT_TIMEOUT}s; dumping container logs:"
+  $ENGINE logs --tail 200 llama-stack
+  die "Llama-Stack startup failed"
+fi
+
+###############################################################################
+# Done
+###############################################################################
+log ""
+log "🎉  Llama‑Stack is ready!"
+log "👉  API endpoint: http://localhost:${PORT}"
+log "📖 Documentation: https://llama-stack.readthedocs.io/en/latest/references/index.html"
+log "💻 To access the llama‑stack CLI, exec into the container:"
+log "   $ENGINE exec -ti llama-stack bash"
+log ""
diff --git a/kvant_build_local.sh b/kvant_build_local.sh
new file mode 100755
index 000000000..9701c57dc
--- /dev/null
+++ b/kvant_build_local.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+export USE_COPY_NOT_MOUNT=true
+export LLAMA_STACK_DIR=.
+
+uvx --from . llama stack build --template kvant --image-type container  --image-name kvant
diff --git a/kvant_start_local.sh b/kvant_start_local.sh
new file mode 100755
index 000000000..db5bff84a
--- /dev/null
+++ b/kvant_start_local.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+export LLAMA_STACK_PORT=8321
+# VLLM_API_TOKEN= env file
+# KEYCLOAK_CLIENT_SECRET= env file
+
+
+# Run the locally built kvant image; quoting keeps checkout paths that contain spaces intact.
+docker run -it \
+  -p "$LLAMA_STACK_PORT:$LLAMA_STACK_PORT" \
+  -v "$(pwd)/data:/root/.llama" \
+  --mount "type=bind,source=$(pwd)/llama_stack/templates/kvant/run.yaml,target=/root/.llama/config.yaml,readonly" \
+  --entrypoint python \
+  --env-file ./.env \
+  distribution-kvant:dev \
+  -m llama_stack.distribution.server.server --config /root/.llama/config.yaml \
+  --port "$LLAMA_STACK_PORT"
diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py
index dec43280b..b79c512b8 100644
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@@ -4,24 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import sys
+from collections.abc import AsyncIterator
 from datetime import datetime
 from enum import Enum
-from typing import (
-    Annotated,
-    Any,
-    AsyncIterator,
-    Dict,
-    List,
-    Literal,
-    Optional,
-    Protocol,
-    Union,
-    runtime_checkable,
-)
+from typing import Annotated, Any, Literal, Protocol, runtime_checkable
 
 from pydantic import BaseModel, ConfigDict, Field
 
 from llama_stack.apis.common.content_types import URL, ContentDelta, InterleavedContent
+from llama_stack.apis.common.responses import Order, PaginatedResponse
 from llama_stack.apis.inference import (
     CompletionMessage,
     ResponseFormat,
@@ -38,6 +30,23 @@ from llama_stack.apis.safety import SafetyViolation
 from llama_stack.apis.tools import ToolDef
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 
+from .openai_responses import (
+    ListOpenAIResponseInputItem,
+    ListOpenAIResponseObject,
+    OpenAIResponseInput,
+    OpenAIResponseInputTool,
+    OpenAIResponseObject,
+    OpenAIResponseObjectStream,
+)
+
+# TODO: use enum.StrEnum when we drop support for python 3.10
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
+
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10 and below."""
+
 
 class Attachment(BaseModel):
     """An attachment to an agent turn.
@@ -72,11 +81,11 @@ class StepCommon(BaseModel):
 
     turn_id: str
     step_id: str
-    started_at: Optional[datetime] = None
-    completed_at: Optional[datetime] = None
+    started_at: datetime | None = None
+    completed_at: datetime | None = None
 
 
-class StepType(Enum):
+class StepType(StrEnum):
     """Type of the step in an agent turn.
 
     :cvar inference: The step is an inference step that calls an LLM.
@@ -100,7 +109,7 @@ class InferenceStep(StepCommon):
 
     model_config = ConfigDict(protected_namespaces=())
 
-    step_type: Literal[StepType.inference.value] = StepType.inference.value
+    step_type: Literal[StepType.inference] = StepType.inference
     model_response: CompletionMessage
 
 
@@ -112,9 +121,9 @@ class ToolExecutionStep(StepCommon):
     :param tool_responses: The tool responses from the tool calls.
     """
 
-    step_type: Literal[StepType.tool_execution.value] = StepType.tool_execution.value
-    tool_calls: List[ToolCall]
-    tool_responses: List[ToolResponse]
+    step_type: Literal[StepType.tool_execution] = StepType.tool_execution
+    tool_calls: list[ToolCall]
+    tool_responses: list[ToolResponse]
 
 
 @json_schema_type
@@ -124,8 +133,8 @@ class ShieldCallStep(StepCommon):
     :param violation: The violation from the shield call.
     """
 
-    step_type: Literal[StepType.shield_call.value] = StepType.shield_call.value
-    violation: Optional[SafetyViolation]
+    step_type: Literal[StepType.shield_call] = StepType.shield_call
+    violation: SafetyViolation | None
 
 
 @json_schema_type
@@ -136,19 +145,14 @@ class MemoryRetrievalStep(StepCommon):
     :param inserted_context: The context retrieved from the vector databases.
     """
 
-    step_type: Literal[StepType.memory_retrieval.value] = StepType.memory_retrieval.value
+    step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval
     # TODO: should this be List[str]?
     vector_db_ids: str
     inserted_context: InterleavedContent
 
 
 Step = Annotated[
-    Union[
-        InferenceStep,
-        ToolExecutionStep,
-        ShieldCallStep,
-        MemoryRetrievalStep,
-    ],
+    InferenceStep | ToolExecutionStep | ShieldCallStep | MemoryRetrievalStep,
     Field(discriminator="step_type"),
 ]
 
@@ -159,18 +163,13 @@ class Turn(BaseModel):
 
     turn_id: str
     session_id: str
-    input_messages: List[
-        Union[
-            UserMessage,
-            ToolResponseMessage,
-        ]
-    ]
-    steps: List[Step]
+    input_messages: list[UserMessage | ToolResponseMessage]
+    steps: list[Step]
     output_message: CompletionMessage
-    output_attachments: Optional[List[Attachment]] = Field(default_factory=list)
+    output_attachments: list[Attachment] | None = Field(default_factory=lambda: [])
 
     started_at: datetime
-    completed_at: Optional[datetime] = None
+    completed_at: datetime | None = None
 
 
 @json_schema_type
@@ -179,34 +178,31 @@ class Session(BaseModel):
 
     session_id: str
     session_name: str
-    turns: List[Turn]
+    turns: list[Turn]
     started_at: datetime
 
 
 class AgentToolGroupWithArgs(BaseModel):
     name: str
-    args: Dict[str, Any]
+    args: dict[str, Any]
 
 
-AgentToolGroup = Union[
-    str,
-    AgentToolGroupWithArgs,
-]
+AgentToolGroup = str | AgentToolGroupWithArgs
 register_schema(AgentToolGroup, name="AgentTool")
 
 
 class AgentConfigCommon(BaseModel):
-    sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams)
+    sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
 
-    input_shields: Optional[List[str]] = Field(default_factory=list)
-    output_shields: Optional[List[str]] = Field(default_factory=list)
-    toolgroups: Optional[List[AgentToolGroup]] = Field(default_factory=list)
-    client_tools: Optional[List[ToolDef]] = Field(default_factory=list)
-    tool_choice: Optional[ToolChoice] = Field(default=None, deprecated="use tool_config instead")
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None, deprecated="use tool_config instead")
-    tool_config: Optional[ToolConfig] = Field(default=None)
+    input_shields: list[str] | None = Field(default_factory=lambda: [])
+    output_shields: list[str] | None = Field(default_factory=lambda: [])
+    toolgroups: list[AgentToolGroup] | None = Field(default_factory=lambda: [])
+    client_tools: list[ToolDef] | None = Field(default_factory=lambda: [])
+    tool_choice: ToolChoice | None = Field(default=None, deprecated="use tool_config instead")
+    tool_prompt_format: ToolPromptFormat | None = Field(default=None, deprecated="use tool_config instead")
+    tool_config: ToolConfig | None = Field(default=None)
 
-    max_infer_iters: Optional[int] = 10
+    max_infer_iters: int | None = 10
 
     def model_post_init(self, __context):
         if self.tool_config:
@@ -236,9 +232,9 @@ class AgentConfig(AgentConfigCommon):
 
     model: str
     instructions: str
-    name: Optional[str] = None
-    enable_session_persistence: Optional[bool] = False
-    response_format: Optional[ResponseFormat] = None
+    name: str | None = None
+    enable_session_persistence: bool | None = False
+    response_format: ResponseFormat | None = None
 
 
 @json_schema_type
@@ -248,21 +244,11 @@ class Agent(BaseModel):
     created_at: datetime
 
 
-@json_schema_type
-class ListAgentsResponse(BaseModel):
-    data: List[Agent]
-
-
-@json_schema_type
-class ListAgentSessionsResponse(BaseModel):
-    data: List[Session]
-
-
 class AgentConfigOverridablePerTurn(AgentConfigCommon):
-    instructions: Optional[str] = None
+    instructions: str | None = None
 
 
-class AgentTurnResponseEventType(Enum):
+class AgentTurnResponseEventType(StrEnum):
     step_start = "step_start"
     step_complete = "step_complete"
     step_progress = "step_progress"
@@ -274,15 +260,15 @@ class AgentTurnResponseEventType(Enum):
 
 @json_schema_type
 class AgentTurnResponseStepStartPayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.step_start.value] = AgentTurnResponseEventType.step_start.value
+    event_type: Literal[AgentTurnResponseEventType.step_start] = AgentTurnResponseEventType.step_start
     step_type: StepType
     step_id: str
-    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
+    metadata: dict[str, Any] | None = Field(default_factory=lambda: {})
 
 
 @json_schema_type
 class AgentTurnResponseStepCompletePayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.step_complete.value] = AgentTurnResponseEventType.step_complete.value
+    event_type: Literal[AgentTurnResponseEventType.step_complete] = AgentTurnResponseEventType.step_complete
     step_type: StepType
     step_id: str
     step_details: Step
@@ -292,7 +278,7 @@ class AgentTurnResponseStepCompletePayload(BaseModel):
 class AgentTurnResponseStepProgressPayload(BaseModel):
     model_config = ConfigDict(protected_namespaces=())
 
-    event_type: Literal[AgentTurnResponseEventType.step_progress.value] = AgentTurnResponseEventType.step_progress.value
+    event_type: Literal[AgentTurnResponseEventType.step_progress] = AgentTurnResponseEventType.step_progress
     step_type: StepType
     step_id: str
 
@@ -301,33 +287,29 @@ class AgentTurnResponseStepProgressPayload(BaseModel):
 
 @json_schema_type
 class AgentTurnResponseTurnStartPayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.turn_start.value] = AgentTurnResponseEventType.turn_start.value
+    event_type: Literal[AgentTurnResponseEventType.turn_start] = AgentTurnResponseEventType.turn_start
     turn_id: str
 
 
 @json_schema_type
 class AgentTurnResponseTurnCompletePayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.turn_complete.value] = AgentTurnResponseEventType.turn_complete.value
+    event_type: Literal[AgentTurnResponseEventType.turn_complete] = AgentTurnResponseEventType.turn_complete
     turn: Turn
 
 
 @json_schema_type
 class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input.value] = (
-        AgentTurnResponseEventType.turn_awaiting_input.value
-    )
+    event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input] = AgentTurnResponseEventType.turn_awaiting_input
     turn: Turn
 
 
 AgentTurnResponseEventPayload = Annotated[
-    Union[
-        AgentTurnResponseStepStartPayload,
-        AgentTurnResponseStepProgressPayload,
-        AgentTurnResponseStepCompletePayload,
-        AgentTurnResponseTurnStartPayload,
-        AgentTurnResponseTurnCompletePayload,
-        AgentTurnResponseTurnAwaitingInputPayload,
-    ],
+    AgentTurnResponseStepStartPayload
+    | AgentTurnResponseStepProgressPayload
+    | AgentTurnResponseStepCompletePayload
+    | AgentTurnResponseTurnStartPayload
+    | AgentTurnResponseTurnCompletePayload
+    | AgentTurnResponseTurnAwaitingInputPayload,
     Field(discriminator="event_type"),
 ]
 register_schema(AgentTurnResponseEventPayload, name="AgentTurnResponseEventPayload")
@@ -356,18 +338,13 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
     # TODO: figure out how we can simplify this and make why
     # ToolResponseMessage needs to be here (it is function call
     # execution from outside the system)
-    messages: List[
-        Union[
-            UserMessage,
-            ToolResponseMessage,
-        ]
-    ]
+    messages: list[UserMessage | ToolResponseMessage]
 
-    documents: Optional[List[Document]] = None
-    toolgroups: Optional[List[AgentToolGroup]] = None
+    documents: list[Document] | None = None
+    toolgroups: list[AgentToolGroup] | None = Field(default_factory=lambda: [])
 
-    stream: Optional[bool] = False
-    tool_config: Optional[ToolConfig] = None
+    stream: bool | None = False
+    tool_config: ToolConfig | None = None
 
 
 @json_schema_type
@@ -375,8 +352,8 @@ class AgentTurnResumeRequest(BaseModel):
     agent_id: str
     session_id: str
     turn_id: str
-    tool_responses: List[ToolResponse]
-    stream: Optional[bool] = False
+    tool_responses: list[ToolResponse]
+    stream: bool | None = False
 
 
 @json_schema_type
@@ -422,17 +399,12 @@ class Agents(Protocol):
         self,
         agent_id: str,
         session_id: str,
-        messages: List[
-            Union[
-                UserMessage,
-                ToolResponseMessage,
-            ]
-        ],
-        stream: Optional[bool] = False,
-        documents: Optional[List[Document]] = None,
-        toolgroups: Optional[List[AgentToolGroup]] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]:
+        messages: list[UserMessage | ToolResponseMessage],
+        stream: bool | None = False,
+        documents: list[Document] | None = None,
+        toolgroups: list[AgentToolGroup] | None = None,
+        tool_config: ToolConfig | None = None,
+    ) -> Turn | AsyncIterator[AgentTurnResponseStreamChunk]:
         """Create a new turn for an agent.
 
         :param agent_id: The ID of the agent to create the turn for.
@@ -443,8 +415,9 @@ class Agents(Protocol):
         :param toolgroups: (Optional) List of toolgroups to create the turn with, will be used in addition to the agent's config toolgroups for the request.
         :param tool_config: (Optional) The tool configuration to create the turn with, will be used to override the agent's tool_config.
         :returns: If stream=False, returns a Turn object.
-                  If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk
+                  If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk.
         """
+        ...
 
     @webmethod(
         route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
@@ -456,9 +429,9 @@ class Agents(Protocol):
         agent_id: str,
         session_id: str,
         turn_id: str,
-        tool_responses: List[ToolResponse],
-        stream: Optional[bool] = False,
-    ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]:
+        tool_responses: list[ToolResponse],
+        stream: bool | None = False,
+    ) -> Turn | AsyncIterator[AgentTurnResponseStreamChunk]:
         """Resume an agent turn with executed tool call responses.
 
         When a Turn has the status `awaiting_input` due to pending input from client side tool calls, this endpoint can be used to submit the outputs from the tool calls once they are ready.
@@ -531,13 +504,14 @@ class Agents(Protocol):
         self,
         session_id: str,
         agent_id: str,
-        turn_ids: Optional[List[str]] = None,
+        turn_ids: list[str] | None = None,
     ) -> Session:
         """Retrieve an agent session by its ID.
 
         :param session_id: The ID of the session to get.
         :param agent_id: The ID of the agent to get the session for.
         :param turn_ids: (Optional) List of turn IDs to filter the session by.
+        :returns: A Session.
         """
         ...
 
@@ -547,7 +521,7 @@ class Agents(Protocol):
         session_id: str,
         agent_id: str,
     ) -> None:
-        """Delete an agent session by its ID.
+        """Delete an agent session by its ID and its associated turns.
 
         :param session_id: The ID of the session to delete.
         :param agent_id: The ID of the agent to delete the session for.
@@ -559,17 +533,19 @@ class Agents(Protocol):
         self,
         agent_id: str,
     ) -> None:
-        """Delete an agent by its ID.
+        """Delete an agent by its ID and its associated sessions and turns.
 
         :param agent_id: The ID of the agent to delete.
         """
         ...
 
     @webmethod(route="/agents", method="GET")
-    async def list_agents(self) -> ListAgentsResponse:
+    async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
         """List all agents.
 
-        :returns: A ListAgentsResponse.
+        :param start_index: The index to start the pagination from.
+        :param limit: The number of agents to return.
+        :returns: A PaginatedResponse.
         """
         ...
 
@@ -586,10 +562,94 @@ class Agents(Protocol):
     async def list_agent_sessions(
         self,
         agent_id: str,
-    ) -> ListAgentSessionsResponse:
+        start_index: int | None = None,
+        limit: int | None = None,
+    ) -> PaginatedResponse:
         """List all session(s) of a given agent.
 
         :param agent_id: The ID of the agent to list sessions for.
-        :returns: A ListAgentSessionsResponse.
+        :param start_index: The index to start the pagination from.
+        :param limit: The number of sessions to return.
+        :returns: A PaginatedResponse.
+        """
+        ...
+
+    # We situate the OpenAI Responses API in the Agents API just like we did things
+    # for Inference. The Responses API, in its intent, serves the same purpose as
+    # the Agents API above -- it is essentially a lightweight "agentic loop" with
+    # integrated tool calling.
+    #
+    # Both of these APIs are inherently stateful.
+
+    @webmethod(route="/openai/v1/responses/{response_id}", method="GET")
+    async def get_openai_response(
+        self,
+        response_id: str,
+    ) -> OpenAIResponseObject:
+        """Retrieve an OpenAI response by its ID.
+
+        :param response_id: The ID of the OpenAI response to retrieve.
+        :returns: An OpenAIResponseObject.
+        """
+        ...
+
+    @webmethod(route="/openai/v1/responses", method="POST")
+    async def create_openai_response(
+        self,
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        instructions: str | None = None,
+        previous_response_id: str | None = None,
+        store: bool | None = True,
+        stream: bool | None = False,
+        temperature: float | None = None,
+        tools: list[OpenAIResponseInputTool] | None = None,
+    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
+        """Create a new OpenAI response.
+
+        :param input: Input message(s) to create the response.
+        :param model: The underlying LLM used for completions.
+        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
+        :returns: An OpenAIResponseObject, or an async iterator of OpenAIResponseObjectStream chunks.
+        """
+        ...
+
+    @webmethod(route="/openai/v1/responses", method="GET")
+    async def list_openai_responses(
+        self,
+        after: str | None = None,
+        limit: int | None = 50,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIResponseObject:
+        """List all OpenAI responses.
+
+        :param after: The ID of the last response to return.
+        :param limit: The number of responses to return.
+        :param model: The model to filter responses by.
+        :param order: The order to sort responses by when sorted by created_at ('asc' or 'desc').
+        :returns: A ListOpenAIResponseObject.
+        """
+        ...
+
+    @webmethod(route="/openai/v1/responses/{response_id}/input_items", method="GET")
+    async def list_openai_response_input_items(
+        self,
+        response_id: str,
+        after: str | None = None,
+        before: str | None = None,
+        include: list[str] | None = None,
+        limit: int | None = 20,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIResponseInputItem:
+        """List input items for a given OpenAI response.
+
+        :param response_id: The ID of the response to retrieve input items for.
+        :param after: An item ID to list items after, used for pagination.
+        :param before: An item ID to list items before, used for pagination.
+        :param include: Additional fields to include in the response.
+        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
+        :param order: The order to return the input items in. Default is desc.
+        :returns: A ListOpenAIResponseInputItem.
         """
         ...
diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py
new file mode 100644
index 000000000..6806e1d3f
--- /dev/null
+++ b/llama_stack/apis/agents/openai_responses.py
@@ -0,0 +1,279 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Annotated, Any, Literal
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type, register_schema
+
+# NOTE(ashwin): this file is literally a copy of the OpenAI responses API schema. We should probably
+# take their YAML and generate this file automatically. Their YAML is available.
+
+
+@json_schema_type
+class OpenAIResponseError(BaseModel):
+    code: str
+    message: str
+
+
+@json_schema_type
+class OpenAIResponseInputMessageContentText(BaseModel):
+    text: str
+    type: Literal["input_text"] = "input_text"
+
+
+@json_schema_type
+class OpenAIResponseInputMessageContentImage(BaseModel):
+    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
+    type: Literal["input_image"] = "input_image"
+    # TODO: handle file_id
+    image_url: str | None = None
+
+
+# TODO: handle file content types
+OpenAIResponseInputMessageContent = Annotated[
+    OpenAIResponseInputMessageContentText | OpenAIResponseInputMessageContentImage,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")
+
+
+@json_schema_type
+class OpenAIResponseOutputMessageContentOutputText(BaseModel):
+    text: str
+    type: Literal["output_text"] = "output_text"
+
+
+OpenAIResponseOutputMessageContent = Annotated[
+    OpenAIResponseOutputMessageContentOutputText,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMessageContent")
+
+
+@json_schema_type
+class OpenAIResponseMessage(BaseModel):
+    """
+    Corresponds to the various Message types in the Responses API.
+    They are all under one type because the Responses API gives them all
+    the same "type" value, and there is no way to tell them apart in certain
+    scenarios.
+    """
+
+    content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]
+    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
+    type: Literal["message"] = "message"
+
+    # The fields below are not used in all scenarios, but are required in others.
+    id: str | None = None
+    status: str | None = None
+
+
+@json_schema_type
+class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
+    id: str
+    status: str
+    type: Literal["web_search_call"] = "web_search_call"
+
+
+@json_schema_type
+class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
+    call_id: str
+    name: str
+    arguments: str
+    type: Literal["function_call"] = "function_call"
+    id: str | None = None
+    status: str | None = None
+
+
+@json_schema_type
+class OpenAIResponseOutputMessageMCPCall(BaseModel):
+    id: str
+    type: Literal["mcp_call"] = "mcp_call"
+    arguments: str
+    name: str
+    server_label: str
+    error: str | None = None
+    output: str | None = None
+
+
+class MCPListToolsTool(BaseModel):
+    input_schema: dict[str, Any]
+    name: str
+    description: str | None = None
+
+
+@json_schema_type
+class OpenAIResponseOutputMessageMCPListTools(BaseModel):
+    id: str
+    type: Literal["mcp_list_tools"] = "mcp_list_tools"
+    server_label: str
+    tools: list[MCPListToolsTool]
+
+
+OpenAIResponseOutput = Annotated[
+    OpenAIResponseMessage
+    | OpenAIResponseOutputMessageWebSearchToolCall
+    | OpenAIResponseOutputMessageFunctionToolCall
+    | OpenAIResponseOutputMessageMCPCall
+    | OpenAIResponseOutputMessageMCPListTools,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput")
+
+
+@json_schema_type
+class OpenAIResponseObject(BaseModel):
+    created_at: int
+    error: OpenAIResponseError | None = None
+    id: str
+    model: str
+    object: Literal["response"] = "response"
+    output: list[OpenAIResponseOutput]
+    parallel_tool_calls: bool = False
+    previous_response_id: str | None = None
+    status: str
+    temperature: float | None = None
+    top_p: float | None = None
+    truncation: str | None = None
+    user: str | None = None
+
+
+@json_schema_type
+class OpenAIResponseObjectStreamResponseCreated(BaseModel):
+    response: OpenAIResponseObject
+    type: Literal["response.created"] = "response.created"
+
+
+@json_schema_type
+class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):
+    content_index: int
+    delta: str
+    item_id: str
+    output_index: int
+    sequence_number: int
+    type: Literal["response.output_text.delta"] = "response.output_text.delta"
+
+
+@json_schema_type
+class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
+    response: OpenAIResponseObject
+    type: Literal["response.completed"] = "response.completed"
+
+
+OpenAIResponseObjectStream = Annotated[
+    OpenAIResponseObjectStreamResponseCreated
+    | OpenAIResponseObjectStreamResponseOutputTextDelta
+    | OpenAIResponseObjectStreamResponseCompleted,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")
+
+
+@json_schema_type
+class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
+    """
+    This represents the output of a function call that gets passed back to the model.
+    """
+
+    call_id: str
+    output: str
+    type: Literal["function_call_output"] = "function_call_output"
+    id: str | None = None
+    status: str | None = None
+
+
+OpenAIResponseInput = Annotated[
+    # Responses API allows output messages to be passed in as input
+    OpenAIResponseOutputMessageWebSearchToolCall
+    | OpenAIResponseOutputMessageFunctionToolCall
+    | OpenAIResponseInputFunctionToolCallOutput
+    |
+    # Fallback to the generic message type as a last resort
+    OpenAIResponseMessage,
+    Field(union_mode="left_to_right"),
+]
+register_schema(OpenAIResponseInput, name="OpenAIResponseInput")
+
+
+@json_schema_type
+class OpenAIResponseInputToolWebSearch(BaseModel):
+    type: Literal["web_search"] | Literal["web_search_preview_2025_03_11"] = "web_search"
+    # Must be exactly "low", "medium" or "high" (alternation is grouped so both anchors apply). TODO: actually use search_context_size somewhere...
+    search_context_size: str | None = Field(default="medium", pattern="^(low|medium|high)$")
+    # TODO: add user_location
+
+
+@json_schema_type
+class OpenAIResponseInputToolFunction(BaseModel):
+    type: Literal["function"] = "function"
+    name: str
+    description: str | None = None
+    parameters: dict[str, Any] | None
+    strict: bool | None = None
+
+
+class FileSearchRankingOptions(BaseModel):
+    ranker: str | None = None
+    score_threshold: float | None = Field(default=0.0, ge=0.0, le=1.0)
+
+
+@json_schema_type
+class OpenAIResponseInputToolFileSearch(BaseModel):
+    type: Literal["file_search"] = "file_search"
+    vector_store_id: list[str]
+    ranking_options: FileSearchRankingOptions | None = None
+    # TODO: add filters
+
+
+class ApprovalFilter(BaseModel):
+    always: list[str] | None = None
+    never: list[str] | None = None
+
+
+class AllowedToolsFilter(BaseModel):
+    tool_names: list[str] | None = None
+
+
+@json_schema_type
+class OpenAIResponseInputToolMCP(BaseModel):
+    type: Literal["mcp"] = "mcp"
+    server_label: str
+    server_url: str
+    headers: dict[str, Any] | None = None
+
+    require_approval: Literal["always"] | Literal["never"] | ApprovalFilter = "never"
+    allowed_tools: list[str] | AllowedToolsFilter | None = None
+
+
+OpenAIResponseInputTool = Annotated[
+    OpenAIResponseInputToolWebSearch
+    | OpenAIResponseInputToolFileSearch
+    | OpenAIResponseInputToolFunction
+    | OpenAIResponseInputToolMCP,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
+
+
+class ListOpenAIResponseInputItem(BaseModel):
+    data: list[OpenAIResponseInput]
+    object: Literal["list"] = "list"
+
+
+@json_schema_type
+class OpenAIResponseObjectWithInput(OpenAIResponseObject):
+    input: list[OpenAIResponseInput]
+
+
+@json_schema_type
+class ListOpenAIResponseObject(BaseModel):
+    data: list[OpenAIResponseObjectWithInput]
+    has_more: bool
+    first_id: str
+    last_id: str
+    object: Literal["list"] = "list"
diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py
index 7a324128d..b2aa637e2 100644
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List, Optional, Protocol, runtime_checkable
+from typing import Protocol, runtime_checkable
 
 from llama_stack.apis.common.job_types import Job
 from llama_stack.apis.inference import (
@@ -34,22 +34,45 @@ class BatchInference(Protocol):
     async def completion(
         self,
         model: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Job: ...
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
+    ) -> Job:
+        """Generate completions for a batch of content.
+
+        :param model: The model to use for the completion.
+        :param content_batch: The content to complete.
+        :param sampling_params: The sampling parameters to use for the completion.
+        :param response_format: The response format to use for the completion.
+        :param logprobs: The logprobs to use for the completion.
+        :returns: A job for the completion.
+        """
+        ...
 
     @webmethod(route="/batch-inference/chat-completion", method="POST")
     async def chat_completion(
         self,
         model: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
         # zero-shot tool definitions as input to the model
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Job: ...
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
+    ) -> Job:
+        """Generate chat completions for a batch of messages.
+
+        :param model: The model to use for the chat completion.
+        :param messages_batch: The messages to complete.
+        :param sampling_params: The sampling parameters to use for the completion.
+        :param tools: The tools to use for the chat completion.
+        :param tool_choice: The tool choice to use for the chat completion.
+        :param tool_prompt_format: The tool prompt format to use for the chat completion.
+        :param response_format: The response format to use for the chat completion.
+        :param logprobs: The logprobs to use for the chat completion.
+        :returns: A job for the chat completion.
+        """
+        ...
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 809af8868..d80c767f8 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+from typing import Any, Literal, Protocol, runtime_checkable
 
 from pydantic import BaseModel, Field
 
@@ -13,8 +13,8 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 
 class CommonBenchmarkFields(BaseModel):
     dataset_id: str
-    scoring_functions: List[str]
-    metadata: Dict[str, Any] = Field(
+    scoring_functions: list[str]
+    metadata: dict[str, Any] = Field(
         default_factory=dict,
         description="Metadata for this evaluation task",
     )
@@ -22,45 +22,66 @@ class CommonBenchmarkFields(BaseModel):
 
 @json_schema_type
 class Benchmark(CommonBenchmarkFields, Resource):
-    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
+    type: Literal[ResourceType.benchmark] = ResourceType.benchmark
 
     @property
     def benchmark_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_benchmark_id(self) -> str:
+    def provider_benchmark_id(self) -> str | None:
         return self.provider_resource_id
 
 
 class BenchmarkInput(CommonBenchmarkFields, BaseModel):
     benchmark_id: str
-    provider_id: Optional[str] = None
-    provider_benchmark_id: Optional[str] = None
+    provider_id: str | None = None
+    provider_benchmark_id: str | None = None
 
 
 class ListBenchmarksResponse(BaseModel):
-    data: List[Benchmark]
+    data: list[Benchmark]
 
 
 @runtime_checkable
 class Benchmarks(Protocol):
     @webmethod(route="/eval/benchmarks", method="GET")
-    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+    async def list_benchmarks(self) -> ListBenchmarksResponse:
+        """List all benchmarks.
+
+        :returns: A ListBenchmarksResponse.
+        """
+        ...
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
     async def get_benchmark(
         self,
         benchmark_id: str,
-    ) -> Benchmark: ...
+    ) -> Benchmark:
+        """Get a benchmark by its ID.
+
+        :param benchmark_id: The ID of the benchmark to get.
+        :returns: A Benchmark.
+        """
+        ...
 
     @webmethod(route="/eval/benchmarks", method="POST")
     async def register_benchmark(
         self,
         benchmark_id: str,
         dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
+        scoring_functions: list[str],
+        provider_benchmark_id: str | None = None,
+        provider_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Register a benchmark.
+
+        :param benchmark_id: The ID of the benchmark to register.
+        :param dataset_id: The ID of the dataset to use for the benchmark.
+        :param scoring_functions: The scoring functions to use for the benchmark.
+        :param provider_benchmark_id: The ID of the provider benchmark to use for the benchmark.
+        :param provider_id: The ID of the provider to use for the benchmark.
+        :param metadata: The metadata to use for the benchmark.
+        """
+        ...
diff --git a/llama_stack/apis/common/content_types.py b/llama_stack/apis/common/content_types.py
index 9d4e21308..8bcb781f7 100644
--- a/llama_stack/apis/common/content_types.py
+++ b/llama_stack/apis/common/content_types.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Annotated, List, Literal, Optional, Union
+from typing import Annotated, Literal
 
 from pydantic import BaseModel, Field, model_validator
 
@@ -26,9 +26,9 @@ class _URLOrData(BaseModel):
     :param data: base64 encoded image data as string
     """
 
-    url: Optional[URL] = None
+    url: URL | None = None
     # data is a base64 encoded string, hint with contentEncoding=base64
-    data: Optional[str] = Field(contentEncoding="base64", default=None)
+    data: str | None = Field(default=None, json_schema_extra={"contentEncoding": "base64"})
 
     @model_validator(mode="before")
     @classmethod
@@ -64,13 +64,13 @@ class TextContentItem(BaseModel):
 
 # other modalities can be added here
 InterleavedContentItem = Annotated[
-    Union[ImageContentItem, TextContentItem],
+    ImageContentItem | TextContentItem,
     Field(discriminator="type"),
 ]
 register_schema(InterleavedContentItem, name="InterleavedContentItem")
 
 # accept a single "str" as a special case since it is common
-InterleavedContent = Union[str, InterleavedContentItem, List[InterleavedContentItem]]
+InterleavedContent = str | InterleavedContentItem | list[InterleavedContentItem]
 register_schema(InterleavedContent, name="InterleavedContent")
 
 
@@ -100,13 +100,13 @@ class ToolCallDelta(BaseModel):
     # you either send an in-progress tool call so the client can stream a long
     # code generation or you send the final parsed tool call at the end of the
     # stream
-    tool_call: Union[str, ToolCall]
+    tool_call: str | ToolCall
     parse_status: ToolCallParseStatus
 
 
 # streaming completions send a stream of ContentDeltas
 ContentDelta = Annotated[
-    Union[TextDelta, ImageDelta, ToolCallDelta],
+    TextDelta | ImageDelta | ToolCallDelta,
     Field(discriminator="type"),
 ]
 register_schema(ContentDelta, name="ContentDelta")
diff --git a/llama_stack/apis/common/deployment_types.py b/llama_stack/apis/common/deployment_types.py
deleted file mode 100644
index 83eea28a2..000000000
--- a/llama_stack/apis/common/deployment_types.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any, Dict, Optional
-
-from pydantic import BaseModel
-
-from llama_stack.apis.common.content_types import URL
-from llama_stack.schema_utils import json_schema_type
-
-
-@json_schema_type
-class RestAPIMethod(Enum):
-    GET = "GET"
-    POST = "POST"
-    PUT = "PUT"
-    DELETE = "DELETE"
-
-
-@json_schema_type
-class RestAPIExecutionConfig(BaseModel):
-    url: URL
-    method: RestAPIMethod
-    params: Optional[Dict[str, Any]] = None
-    headers: Optional[Dict[str, Any]] = None
-    body: Optional[Dict[str, Any]] = None
diff --git a/llama_stack/apis/common/responses.py b/llama_stack/apis/common/responses.py
index f9e9a4c31..5cb41e23d 100644
--- a/llama_stack/apis/common/responses.py
+++ b/llama_stack/apis/common/responses.py
@@ -4,13 +4,19 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List
+from enum import Enum
+from typing import Any
 
 from pydantic import BaseModel
 
 from llama_stack.schema_utils import json_schema_type
 
 
+class Order(Enum):
+    asc = "asc"
+    desc = "desc"
+
+
 @json_schema_type
 class PaginatedResponse(BaseModel):
     """A generic paginated response that follows a simple format.
@@ -19,5 +25,5 @@ class PaginatedResponse(BaseModel):
     :param has_more: Whether there are more items available after this set
     """
 
-    data: List[Dict[str, Any]]
+    data: list[dict[str, Any]]
     has_more: bool
diff --git a/llama_stack/apis/common/training_types.py b/llama_stack/apis/common/training_types.py
index d6c6c6919..46cd101af 100644
--- a/llama_stack/apis/common/training_types.py
+++ b/llama_stack/apis/common/training_types.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 
 from datetime import datetime
-from typing import Optional
 
 from pydantic import BaseModel
 
@@ -27,4 +26,4 @@ class Checkpoint(BaseModel):
     epoch: int
     post_training_job_id: str
     path: str
-    training_metrics: Optional[PostTrainingMetric] = None
+    training_metrics: PostTrainingMetric | None = None
diff --git a/llama_stack/apis/common/type_system.py b/llama_stack/apis/common/type_system.py
index 5d9f000be..db4aab4c5 100644
--- a/llama_stack/apis/common/type_system.py
+++ b/llama_stack/apis/common/type_system.py
@@ -4,10 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Literal, Union
+from typing import Annotated, Literal
 
 from pydantic import BaseModel, Field
-from typing_extensions import Annotated
 
 from llama_stack.schema_utils import json_schema_type, register_schema
 
@@ -73,18 +72,16 @@ class DialogType(BaseModel):
 
 
 ParamType = Annotated[
-    Union[
-        StringType,
-        NumberType,
-        BooleanType,
-        ArrayType,
-        ObjectType,
-        JsonType,
-        UnionType,
-        ChatCompletionInputType,
-        CompletionInputType,
-        AgentTurnInputType,
-    ],
+    StringType
+    | NumberType
+    | BooleanType
+    | ArrayType
+    | ObjectType
+    | JsonType
+    | UnionType
+    | ChatCompletionInputType
+    | CompletionInputType
+    | AgentTurnInputType,
     Field(discriminator="type"),
 ]
 register_schema(ParamType, name="ParamType")
diff --git a/llama_stack/apis/datasetio/datasetio.py b/llama_stack/apis/datasetio/datasetio.py
index 6331882fb..1183983cc 100644
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
+from typing import Any, Protocol, runtime_checkable
 
 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasets import Dataset
@@ -24,8 +24,8 @@ class DatasetIO(Protocol):
     async def iterrows(
         self,
         dataset_id: str,
-        start_index: Optional[int] = None,
-        limit: Optional[int] = None,
+        start_index: int | None = None,
+        limit: int | None = None,
     ) -> PaginatedResponse:
         """Get a paginated list of rows from a dataset.
 
@@ -34,14 +34,21 @@ class DatasetIO(Protocol):
         - limit: Number of items to return. If None or -1, returns all items.
 
         The response includes:
-        - data: List of items for the current page
-        - has_more: Whether there are more items available after this set
+        - data: List of items for the current page.
+        - has_more: Whether there are more items available after this set.
 
         :param dataset_id: The ID of the dataset to get the rows from.
         :param start_index: Index into dataset for the first row to get. Get all rows if None.
         :param limit: The number of rows to get.
+        :returns: A PaginatedResponse.
         """
         ...
 
     @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST")
-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
+        """Append rows to a dataset.
+
+        :param dataset_id: The ID of the dataset to append the rows to.
+        :param rows: The rows to append to the dataset.
+        """
+        ...
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index 32ccde144..e3de3d5cb 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Annotated, Any, Dict, List, Literal, Optional, Protocol, Union
+from typing import Annotated, Any, Literal, Protocol
 
 from pydantic import BaseModel, Field
 
@@ -81,11 +81,11 @@ class RowsDataSource(BaseModel):
     """
 
     type: Literal["rows"] = "rows"
-    rows: List[Dict[str, Any]]
+    rows: list[dict[str, Any]]
 
 
 DataSource = Annotated[
-    Union[URIDataSource, RowsDataSource],
+    URIDataSource | RowsDataSource,
     Field(discriminator="type"),
 ]
 register_schema(DataSource, name="DataSource")
@@ -98,7 +98,7 @@ class CommonDatasetFields(BaseModel):
 
     purpose: DatasetPurpose
     source: DataSource
-    metadata: Dict[str, Any] = Field(
+    metadata: dict[str, Any] = Field(
         default_factory=dict,
         description="Any additional metadata for this dataset",
     )
@@ -106,14 +106,14 @@ class CommonDatasetFields(BaseModel):
 
 @json_schema_type
 class Dataset(CommonDatasetFields, Resource):
-    type: Literal[ResourceType.dataset.value] = ResourceType.dataset.value
+    type: Literal[ResourceType.dataset] = ResourceType.dataset
 
     @property
     def dataset_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_dataset_id(self) -> str:
+    def provider_dataset_id(self) -> str | None:
         return self.provider_resource_id
 
 
@@ -122,7 +122,7 @@ class DatasetInput(CommonDatasetFields, BaseModel):
 
 
 class ListDatasetsResponse(BaseModel):
-    data: List[Dataset]
+    data: list[Dataset]
 
 
 class Datasets(Protocol):
@@ -131,13 +131,14 @@ class Datasets(Protocol):
         self,
         purpose: DatasetPurpose,
         source: DataSource,
-        metadata: Optional[Dict[str, Any]] = None,
-        dataset_id: Optional[str] = None,
+        metadata: dict[str, Any] | None = None,
+        dataset_id: str | None = None,
     ) -> Dataset:
         """
         Register a new dataset.
 
-        :param purpose: The purpose of the dataset. One of
+        :param purpose: The purpose of the dataset.
+        One of:
             - "post-training/messages": The dataset contains a messages column with list of messages for post-training.
                 {
                     "messages": [
@@ -188,8 +189,9 @@ class Datasets(Protocol):
                ]
            }
         :param metadata: The metadata for the dataset.
-           - E.g. {"description": "My dataset"}
+           - E.g. {"description": "My dataset"}.
         :param dataset_id: The ID of the dataset. If not provided, an ID will be generated.
+        :returns: A Dataset.
         """
         ...
 
@@ -197,13 +199,29 @@ class Datasets(Protocol):
     async def get_dataset(
         self,
         dataset_id: str,
-    ) -> Dataset: ...
+    ) -> Dataset:
+        """Get a dataset by its ID.
+
+        :param dataset_id: The ID of the dataset to get.
+        :returns: A Dataset.
+        """
+        ...
 
     @webmethod(route="/datasets", method="GET")
-    async def list_datasets(self) -> ListDatasetsResponse: ...
+    async def list_datasets(self) -> ListDatasetsResponse:
+        """List all datasets.
+
+        :returns: A ListDatasetsResponse.
+        """
+        ...
 
     @webmethod(route="/datasets/{dataset_id:path}", method="DELETE")
     async def unregister_dataset(
         self,
         dataset_id: str,
-    ) -> None: ...
+    ) -> None:
+        """Unregister a dataset by its ID.
+
+        :param dataset_id: The ID of the dataset to unregister.
+        """
+        ...
diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py
index 25f3ab1ab..63a764725 100644
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Optional
 
 from pydantic import BaseModel
 
@@ -54,4 +53,4 @@ class Error(BaseModel):
     status: int
     title: str
     detail: str
-    instance: Optional[str] = None
+    instance: str | None = None
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 0e5959c37..83a0a8e56 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -4,10 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List, Literal, Optional, Protocol, Union
+from typing import Annotated, Any, Literal, Protocol
 
 from pydantic import BaseModel, Field
-from typing_extensions import Annotated
 
 from llama_stack.apis.agents import AgentConfig
 from llama_stack.apis.common.job_types import Job
@@ -29,7 +28,7 @@ class ModelCandidate(BaseModel):
     type: Literal["model"] = "model"
     model: str
     sampling_params: SamplingParams
-    system_message: Optional[SystemMessage] = None
+    system_message: SystemMessage | None = None
 
 
 @json_schema_type
@@ -43,7 +42,7 @@ class AgentCandidate(BaseModel):
     config: AgentConfig
 
 
-EvalCandidate = Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")]
+EvalCandidate = Annotated[ModelCandidate | AgentCandidate, Field(discriminator="type")]
 register_schema(EvalCandidate, name="EvalCandidate")
 
 
@@ -57,11 +56,11 @@ class BenchmarkConfig(BaseModel):
     """
 
     eval_candidate: EvalCandidate
-    scoring_params: Dict[str, ScoringFnParams] = Field(
+    scoring_params: dict[str, ScoringFnParams] = Field(
         description="Map between scoring function id and parameters for each scoring function you want to run",
         default_factory=dict,
     )
-    num_examples: Optional[int] = Field(
+    num_examples: int | None = Field(
         description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
         default=None,
     )
@@ -76,9 +75,9 @@ class EvaluateResponse(BaseModel):
     :param scores: The scores from the evaluation.
     """
 
-    generations: List[Dict[str, Any]]
+    generations: list[dict[str, Any]]
     # each key in the dict is a scoring function name
-    scores: Dict[str, ScoringResult]
+    scores: dict[str, ScoringResult]
 
 
 class Eval(Protocol):
@@ -94,15 +93,16 @@ class Eval(Protocol):
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
         :param benchmark_config: The configuration for the benchmark.
-        :return: The job that was created to run the evaluation.
+        :returns: The job that was created to run the evaluation.
         """
+        ...
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
     async def evaluate_rows(
         self,
         benchmark_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
+        input_rows: list[dict[str, Any]],
+        scoring_functions: list[str],
         benchmark_config: BenchmarkConfig,
     ) -> EvaluateResponse:
         """Evaluate a list of rows on a benchmark.
@@ -111,8 +111,9 @@ class Eval(Protocol):
         :param input_rows: The rows to evaluate.
         :param scoring_functions: The scoring functions to use for the evaluation.
         :param benchmark_config: The configuration for the benchmark.
-        :return: EvaluateResponse object containing generations and scores
+        :returns: EvaluateResponse object containing generations and scores.
         """
+        ...
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
     async def job_status(self, benchmark_id: str, job_id: str) -> Job:
@@ -120,7 +121,7 @@ class Eval(Protocol):
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
         :param job_id: The ID of the job to get the status of.
-        :return: The status of the evaluationjob.
+        :returns: The status of the evaluation job.
         """
         ...
 
@@ -139,5 +140,6 @@ class Eval(Protocol):
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
         :param job_id: The ID of the job to get the result of.
-        :return: The result of the job.
+        :returns: The result of the job.
         """
+        ...
diff --git a/llama_stack/apis/files/files.py b/llama_stack/apis/files/files.py
index ef8b65829..1d762a68a 100644
--- a/llama_stack/apis/files/files.py
+++ b/llama_stack/apis/files/files.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List, Optional, Protocol, runtime_checkable
+from typing import Protocol, runtime_checkable
 
 from pydantic import BaseModel
 
@@ -42,7 +42,7 @@ class ListBucketResponse(BaseModel):
     :param data: List of FileResponse entries
     """
 
-    data: List[BucketResponse]
+    data: list[BucketResponse]
 
 
 @json_schema_type
@@ -74,7 +74,7 @@ class ListFileResponse(BaseModel):
     :param data: List of FileResponse entries
     """
 
-    data: List[FileResponse]
+    data: list[FileResponse]
 
 
 @runtime_checkable
@@ -91,10 +91,11 @@ class Files(Protocol):
         """
         Create a new upload session for a file identified by a bucket and key.
 
-        :param bucket: Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
-        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
-        :param mime_type: MIME type of the file
-        :param size: File size in bytes
+        :param bucket: Bucket under which the file is stored (valid chars: a-zA-Z0-9_-).
+        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).
+        :param mime_type: MIME type of the file.
+        :param size: File size in bytes.
+        :returns: A FileUploadResponse.
         """
         ...
 
@@ -102,12 +103,13 @@ class Files(Protocol):
     async def upload_content_to_session(
         self,
         upload_id: str,
-    ) -> Optional[FileResponse]:
+    ) -> FileResponse | None:
         """
         Upload file content to an existing upload session.
         On the server, request body will have the raw bytes that are uploaded.
 
-        :param upload_id: ID of the upload session
+        :param upload_id: ID of the upload session.
+        :returns: A FileResponse or None if the upload is not complete.
         """
         ...
 
@@ -117,9 +119,10 @@ class Files(Protocol):
         upload_id: str,
     ) -> FileUploadResponse:
         """
-        Returns information about an existsing upload session
+        Returns information about an existing upload session.
 
-        :param upload_id: ID of the upload session
+        :param upload_id: ID of the upload session.
+        :returns: A FileUploadResponse.
         """
         ...
 
@@ -130,6 +133,9 @@ class Files(Protocol):
     ) -> ListBucketResponse:
         """
         List all buckets.
+
+        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-).
+        :returns: A ListBucketResponse.
         """
         ...
 
@@ -141,7 +147,8 @@ class Files(Protocol):
         """
         List all files in a bucket.
 
-        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-)
+        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-).
+        :returns: A ListFileResponse.
         """
         ...
 
@@ -154,8 +161,9 @@ class Files(Protocol):
         """
         Get a file info identified by a bucket and key.
 
-        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-)
-        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
+        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-).
+        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).
+        :returns: A FileResponse.
         """
         ...
 
@@ -168,7 +176,7 @@ class Files(Protocol):
         """
         Delete a file identified by a bucket and key.
 
-        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-)
-        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
+        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-).
+        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).
         """
         ...
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 309171f20..74697dd18 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -4,23 +4,22 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import sys
+from collections.abc import AsyncIterator
 from enum import Enum
 from typing import (
+    Annotated,
     Any,
-    AsyncIterator,
-    Dict,
-    List,
     Literal,
-    Optional,
     Protocol,
-    Union,
     runtime_checkable,
 )
 
 from pydantic import BaseModel, Field, field_validator
-from typing_extensions import Annotated, TypedDict
+from typing_extensions import TypedDict
 
 from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.responses import Order
 from llama_stack.apis.models import Model
 from llama_stack.apis.telemetry.telemetry import MetricResponseMixin
 from llama_stack.models.llama.datatypes import (
@@ -38,6 +37,16 @@ register_schema(ToolCall)
 register_schema(ToolParamDefinition)
 register_schema(ToolDefinition)
 
+# TODO: use enum.StrEnum when we drop support for python 3.10
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
+
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10 and below."""
+
+        pass
+
 
 @json_schema_type
 class GreedySamplingStrategy(BaseModel):
@@ -47,8 +56,8 @@ class GreedySamplingStrategy(BaseModel):
 @json_schema_type
 class TopPSamplingStrategy(BaseModel):
     type: Literal["top_p"] = "top_p"
-    temperature: Optional[float] = Field(..., gt=0.0)
-    top_p: Optional[float] = 0.95
+    temperature: float | None = Field(..., gt=0.0)
+    top_p: float | None = 0.95
 
 
 @json_schema_type
@@ -58,7 +67,7 @@ class TopKSamplingStrategy(BaseModel):
 
 
 SamplingStrategy = Annotated[
-    Union[GreedySamplingStrategy, TopPSamplingStrategy, TopKSamplingStrategy],
+    GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy,
     Field(discriminator="type"),
 ]
 register_schema(SamplingStrategy, name="SamplingStrategy")
@@ -79,9 +88,9 @@ class SamplingParams(BaseModel):
 
     strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
 
-    max_tokens: Optional[int] = 0
-    repetition_penalty: Optional[float] = 1.0
-    stop: Optional[List[str]] = None
+    max_tokens: int | None = 0
+    repetition_penalty: float | None = 1.0
+    stop: list[str] | None = None
 
 
 class LogProbConfig(BaseModel):
@@ -90,7 +99,7 @@ class LogProbConfig(BaseModel):
     :param top_k: How many tokens (for each position) to return log probabilities for.
     """
 
-    top_k: Optional[int] = 0
+    top_k: int | None = 0
 
 
 class QuantizationType(Enum):
@@ -125,11 +134,11 @@ class Int4QuantizationConfig(BaseModel):
     """
 
     type: Literal["int4_mixed"] = "int4_mixed"
-    scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
+    scheme: str | None = "int4_weight_int8_dynamic_activation"
 
 
 QuantizationConfig = Annotated[
-    Union[Bf16QuantizationConfig, Fp8QuantizationConfig, Int4QuantizationConfig],
+    Bf16QuantizationConfig | Fp8QuantizationConfig | Int4QuantizationConfig,
     Field(discriminator="type"),
 ]
 
@@ -145,7 +154,7 @@ class UserMessage(BaseModel):
 
     role: Literal["user"] = "user"
     content: InterleavedContent
-    context: Optional[InterleavedContent] = None
+    context: InterleavedContent | None = None
 
 
 @json_schema_type
@@ -190,16 +199,11 @@ class CompletionMessage(BaseModel):
     role: Literal["assistant"] = "assistant"
     content: InterleavedContent
     stop_reason: StopReason
-    tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)
+    tool_calls: list[ToolCall] | None = Field(default_factory=lambda: [])
 
 
 Message = Annotated[
-    Union[
-        UserMessage,
-        SystemMessage,
-        ToolResponseMessage,
-        CompletionMessage,
-    ],
+    UserMessage | SystemMessage | ToolResponseMessage | CompletionMessage,
     Field(discriminator="role"),
 ]
 register_schema(Message, name="Message")
@@ -208,9 +212,9 @@ register_schema(Message, name="Message")
 @json_schema_type
 class ToolResponse(BaseModel):
     call_id: str
-    tool_name: Union[BuiltinTool, str]
+    tool_name: BuiltinTool | str
     content: InterleavedContent
-    metadata: Optional[Dict[str, Any]] = None
+    metadata: dict[str, Any] | None = None
 
     @field_validator("tool_name", mode="before")
     @classmethod
@@ -243,7 +247,7 @@ class TokenLogProbs(BaseModel):
     :param logprobs_by_token: Dictionary mapping tokens to their log probabilities
     """
 
-    logprobs_by_token: Dict[str, float]
+    logprobs_by_token: dict[str, float]
 
 
 class ChatCompletionResponseEventType(Enum):
@@ -271,11 +275,11 @@ class ChatCompletionResponseEvent(BaseModel):
 
     event_type: ChatCompletionResponseEventType
     delta: ContentDelta
-    logprobs: Optional[List[TokenLogProbs]] = None
-    stop_reason: Optional[StopReason] = None
+    logprobs: list[TokenLogProbs] | None = None
+    stop_reason: StopReason | None = None
 
 
-class ResponseFormatType(Enum):
+class ResponseFormatType(StrEnum):
     """Types of formats for structured (guided) decoding.
 
     :cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
@@ -294,8 +298,8 @@ class JsonSchemaResponseFormat(BaseModel):
     :param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
     """
 
-    type: Literal[ResponseFormatType.json_schema.value] = ResponseFormatType.json_schema.value
-    json_schema: Dict[str, Any]
+    type: Literal[ResponseFormatType.json_schema] = ResponseFormatType.json_schema
+    json_schema: dict[str, Any]
 
 
 @json_schema_type
@@ -306,12 +310,12 @@ class GrammarResponseFormat(BaseModel):
     :param bnf: The BNF grammar specification the response should conform to
     """
 
-    type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
-    bnf: Dict[str, Any]
+    type: Literal[ResponseFormatType.grammar] = ResponseFormatType.grammar
+    bnf: dict[str, Any]
 
 
 ResponseFormat = Annotated[
-    Union[JsonSchemaResponseFormat, GrammarResponseFormat],
+    JsonSchemaResponseFormat | GrammarResponseFormat,
     Field(discriminator="type"),
 ]
 register_schema(ResponseFormat, name="ResponseFormat")
@@ -321,10 +325,10 @@ register_schema(ResponseFormat, name="ResponseFormat")
 class CompletionRequest(BaseModel):
     model: str
     content: InterleavedContent
-    sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams)
-    response_format: Optional[ResponseFormat] = None
-    stream: Optional[bool] = False
-    logprobs: Optional[LogProbConfig] = None
+    sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
+    response_format: ResponseFormat | None = None
+    stream: bool | None = False
+    logprobs: LogProbConfig | None = None
 
 
 @json_schema_type
@@ -338,7 +342,7 @@ class CompletionResponse(MetricResponseMixin):
 
     content: str
     stop_reason: StopReason
-    logprobs: Optional[List[TokenLogProbs]] = None
+    logprobs: list[TokenLogProbs] | None = None
 
 
 @json_schema_type
@@ -351,8 +355,8 @@ class CompletionResponseStreamChunk(MetricResponseMixin):
     """
 
     delta: str
-    stop_reason: Optional[StopReason] = None
-    logprobs: Optional[List[TokenLogProbs]] = None
+    stop_reason: StopReason | None = None
+    logprobs: list[TokenLogProbs] | None = None
 
 
 class SystemMessageBehavior(Enum):
@@ -383,9 +387,9 @@ class ToolConfig(BaseModel):
             '{{function_definitions}}' to indicate where the function definitions should be inserted.
     """
 
-    tool_choice: Optional[ToolChoice | str] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
-    system_message_behavior: Optional[SystemMessageBehavior] = Field(default=SystemMessageBehavior.append)
+    tool_choice: ToolChoice | str | None = Field(default=ToolChoice.auto)
+    tool_prompt_format: ToolPromptFormat | None = Field(default=None)
+    system_message_behavior: SystemMessageBehavior | None = Field(default=SystemMessageBehavior.append)
 
     def model_post_init(self, __context: Any) -> None:
         if isinstance(self.tool_choice, str):
@@ -399,15 +403,15 @@ class ToolConfig(BaseModel):
 @json_schema_type
 class ChatCompletionRequest(BaseModel):
     model: str
-    messages: List[Message]
-    sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams)
+    messages: list[Message]
+    sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
 
-    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
-    tool_config: Optional[ToolConfig] = Field(default_factory=ToolConfig)
+    tools: list[ToolDefinition] | None = Field(default_factory=lambda: [])
+    tool_config: ToolConfig | None = Field(default_factory=ToolConfig)
 
-    response_format: Optional[ResponseFormat] = None
-    stream: Optional[bool] = False
-    logprobs: Optional[LogProbConfig] = None
+    response_format: ResponseFormat | None = None
+    stream: bool | None = False
+    logprobs: LogProbConfig | None = None
 
 
 @json_schema_type
@@ -429,7 +433,7 @@ class ChatCompletionResponse(MetricResponseMixin):
     """
 
     completion_message: CompletionMessage
-    logprobs: Optional[List[TokenLogProbs]] = None
+    logprobs: list[TokenLogProbs] | None = None
 
 
 @json_schema_type
@@ -439,7 +443,7 @@ class EmbeddingsResponse(BaseModel):
     :param embeddings: List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
     """
 
-    embeddings: List[List[float]]
+    embeddings: list[list[float]]
 
 
 @json_schema_type
@@ -451,7 +455,7 @@ class OpenAIChatCompletionContentPartTextParam(BaseModel):
 @json_schema_type
 class OpenAIImageURL(BaseModel):
     url: str
-    detail: Optional[str] = None
+    detail: str | None = None
 
 
 @json_schema_type
@@ -461,16 +465,13 @@ class OpenAIChatCompletionContentPartImageParam(BaseModel):
 
 
 OpenAIChatCompletionContentPartParam = Annotated[
-    Union[
-        OpenAIChatCompletionContentPartTextParam,
-        OpenAIChatCompletionContentPartImageParam,
-    ],
+    OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
     Field(discriminator="type"),
 ]
 register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletionContentPartParam")
 
 
-OpenAIChatCompletionMessageContent = Union[str, List[OpenAIChatCompletionContentPartParam]]
+OpenAIChatCompletionMessageContent = str | list[OpenAIChatCompletionContentPartParam]
 
 
 @json_schema_type
@@ -484,7 +485,7 @@ class OpenAIUserMessageParam(BaseModel):
 
     role: Literal["user"] = "user"
     content: OpenAIChatCompletionMessageContent
-    name: Optional[str] = None
+    name: str | None = None
 
 
 @json_schema_type
@@ -498,21 +499,21 @@ class OpenAISystemMessageParam(BaseModel):
 
     role: Literal["system"] = "system"
     content: OpenAIChatCompletionMessageContent
-    name: Optional[str] = None
+    name: str | None = None
 
 
 @json_schema_type
 class OpenAIChatCompletionToolCallFunction(BaseModel):
-    name: Optional[str] = None
-    arguments: Optional[str] = None
+    name: str | None = None
+    arguments: str | None = None
 
 
 @json_schema_type
 class OpenAIChatCompletionToolCall(BaseModel):
-    index: Optional[int] = None
-    id: Optional[str] = None
+    index: int | None = None
+    id: str | None = None
     type: Literal["function"] = "function"
-    function: Optional[OpenAIChatCompletionToolCallFunction] = None
+    function: OpenAIChatCompletionToolCallFunction | None = None
 
 
 @json_schema_type
@@ -526,9 +527,9 @@ class OpenAIAssistantMessageParam(BaseModel):
     """
 
     role: Literal["assistant"] = "assistant"
-    content: Optional[OpenAIChatCompletionMessageContent] = None
-    name: Optional[str] = None
-    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None
+    content: OpenAIChatCompletionMessageContent | None = None
+    name: str | None = None
+    tool_calls: list[OpenAIChatCompletionToolCall] | None = None
 
 
 @json_schema_type
@@ -556,17 +557,15 @@ class OpenAIDeveloperMessageParam(BaseModel):
 
     role: Literal["developer"] = "developer"
     content: OpenAIChatCompletionMessageContent
-    name: Optional[str] = None
+    name: str | None = None
 
 
 OpenAIMessageParam = Annotated[
-    Union[
-        OpenAIUserMessageParam,
-        OpenAISystemMessageParam,
-        OpenAIAssistantMessageParam,
-        OpenAIToolMessageParam,
-        OpenAIDeveloperMessageParam,
-    ],
+    OpenAIUserMessageParam
+    | OpenAISystemMessageParam
+    | OpenAIAssistantMessageParam
+    | OpenAIToolMessageParam
+    | OpenAIDeveloperMessageParam,
     Field(discriminator="role"),
 ]
 register_schema(OpenAIMessageParam, name="OpenAIMessageParam")
@@ -580,14 +579,14 @@ class OpenAIResponseFormatText(BaseModel):
 @json_schema_type
 class OpenAIJSONSchema(TypedDict, total=False):
     name: str
-    description: Optional[str] = None
-    strict: Optional[bool] = None
+    description: str | None
+    strict: bool | None
 
     # Pydantic BaseModel cannot be used with a schema param, since it already
     # has one. And, we don't want to alias here because then have to handle
     # that alias when converting to OpenAI params. So, to support schema,
     # we use a TypedDict.
-    schema: Optional[Dict[str, Any]] = None
+    schema: dict[str, Any] | None
 
 
 @json_schema_type
@@ -602,11 +601,7 @@ class OpenAIResponseFormatJSONObject(BaseModel):
 
 
 OpenAIResponseFormatParam = Annotated[
-    Union[
-        OpenAIResponseFormatText,
-        OpenAIResponseFormatJSONSchema,
-        OpenAIResponseFormatJSONObject,
-    ],
+    OpenAIResponseFormatText | OpenAIResponseFormatJSONSchema | OpenAIResponseFormatJSONObject,
     Field(discriminator="type"),
 ]
 register_schema(OpenAIResponseFormatParam, name="OpenAIResponseFormatParam")
@@ -622,7 +617,7 @@ class OpenAITopLogProb(BaseModel):
     """
 
     token: str
-    bytes: Optional[List[int]] = None
+    bytes: list[int] | None = None
     logprob: float
 
 
@@ -637,9 +632,9 @@ class OpenAITokenLogProb(BaseModel):
     """
 
     token: str
-    bytes: Optional[List[int]] = None
+    bytes: list[int] | None = None
     logprob: float
-    top_logprobs: List[OpenAITopLogProb]
+    top_logprobs: list[OpenAITopLogProb]
 
 
 @json_schema_type
@@ -650,8 +645,8 @@ class OpenAIChoiceLogprobs(BaseModel):
     :param refusal: (Optional) The log probabilities for the tokens in the message
     """
 
-    content: Optional[List[OpenAITokenLogProb]] = None
-    refusal: Optional[List[OpenAITokenLogProb]] = None
+    content: list[OpenAITokenLogProb] | None = None
+    refusal: list[OpenAITokenLogProb] | None = None
 
 
 @json_schema_type
@@ -664,10 +659,10 @@ class OpenAIChoiceDelta(BaseModel):
     :param tool_calls: (Optional) The tool calls of the delta
     """
 
-    content: Optional[str] = None
-    refusal: Optional[str] = None
-    role: Optional[str] = None
-    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None
+    content: str | None = None
+    refusal: str | None = None
+    role: str | None = None
+    tool_calls: list[OpenAIChatCompletionToolCall] | None = None
 
 
 @json_schema_type
@@ -683,7 +678,7 @@ class OpenAIChunkChoice(BaseModel):
     delta: OpenAIChoiceDelta
     finish_reason: str
     index: int
-    logprobs: Optional[OpenAIChoiceLogprobs] = None
+    logprobs: OpenAIChoiceLogprobs | None = None
 
 
 @json_schema_type
@@ -699,7 +694,7 @@ class OpenAIChoice(BaseModel):
     message: OpenAIMessageParam
     finish_reason: str
     index: int
-    logprobs: Optional[OpenAIChoiceLogprobs] = None
+    logprobs: OpenAIChoiceLogprobs | None = None
 
 
 @json_schema_type
@@ -714,7 +709,7 @@ class OpenAIChatCompletion(BaseModel):
     """
 
     id: str
-    choices: List[OpenAIChoice]
+    choices: list[OpenAIChoice]
     object: Literal["chat.completion"] = "chat.completion"
     created: int
     model: str
@@ -732,7 +727,7 @@ class OpenAIChatCompletionChunk(BaseModel):
     """
 
     id: str
-    choices: List[OpenAIChunkChoice]
+    choices: list[OpenAIChunkChoice]
     object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
     created: int
     model: str
@@ -748,10 +743,10 @@ class OpenAICompletionLogprobs(BaseModel):
     :top_logprobs: (Optional) The top log probabilities for the tokens
     """
 
-    text_offset: Optional[List[int]] = None
-    token_logprobs: Optional[List[float]] = None
-    tokens: Optional[List[str]] = None
-    top_logprobs: Optional[List[Dict[str, float]]] = None
+    text_offset: list[int] | None = None
+    token_logprobs: list[float] | None = None
+    tokens: list[str] | None = None
+    top_logprobs: list[dict[str, float]] | None = None
 
 
 @json_schema_type
@@ -767,7 +762,7 @@ class OpenAICompletionChoice(BaseModel):
     finish_reason: str
     text: str
     index: int
-    logprobs: Optional[OpenAIChoiceLogprobs] = None
+    logprobs: OpenAIChoiceLogprobs | None = None
 
 
 @json_schema_type
@@ -782,12 +777,54 @@ class OpenAICompletion(BaseModel):
     """
 
     id: str
-    choices: List[OpenAICompletionChoice]
+    choices: list[OpenAICompletionChoice]
     created: int
     model: str
     object: Literal["text_completion"] = "text_completion"
 
 
+@json_schema_type
+class OpenAIEmbeddingData(BaseModel):
+    """A single embedding data object from an OpenAI-compatible embeddings response.
+
+    :param object: The object type, which will be "embedding"
+    :param embedding: The embedding vector as a list of floats (when encoding_format="float") or as a base64-encoded string (when encoding_format="base64")
+    :param index: The index of the embedding in the input list
+    """
+
+    object: Literal["embedding"] = "embedding"
+    embedding: list[float] | str
+    index: int
+
+
+@json_schema_type
+class OpenAIEmbeddingUsage(BaseModel):
+    """Usage information for an OpenAI-compatible embeddings response.
+
+    :param prompt_tokens: The number of tokens in the input
+    :param total_tokens: The total number of tokens used
+    """
+
+    prompt_tokens: int
+    total_tokens: int
+
+
+@json_schema_type
+class OpenAIEmbeddingsResponse(BaseModel):
+    """Response from an OpenAI-compatible embeddings request.
+
+    :param object: The object type, which will be "list"
+    :param data: List of embedding data objects
+    :param model: The model that was used to generate the embeddings
+    :param usage: Usage information
+    """
+
+    object: Literal["list"] = "list"
+    data: list[OpenAIEmbeddingData]
+    model: str
+    usage: OpenAIEmbeddingUsage
+
+
 class ModelStore(Protocol):
     async def get_model(self, identifier: str) -> Model: ...
 
@@ -818,23 +855,35 @@ class EmbeddingTaskType(Enum):
 
 @json_schema_type
 class BatchCompletionResponse(BaseModel):
-    batch: List[CompletionResponse]
+    batch: list[CompletionResponse]
 
 
 @json_schema_type
 class BatchChatCompletionResponse(BaseModel):
-    batch: List[ChatCompletionResponse]
+    batch: list[ChatCompletionResponse]
+
+
+class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
+    input_messages: list[OpenAIMessageParam]
+
+
+@json_schema_type
+class ListOpenAIChatCompletionResponse(BaseModel):
+    data: list[OpenAICompletionWithInputMessages]
+    has_more: bool
+    first_id: str
+    last_id: str
+    object: Literal["list"] = "list"
 
 
 @runtime_checkable
 @trace_protocol
-class Inference(Protocol):
-    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
-
-    This API provides the raw interface to the underlying models. Two kinds of models are supported:
-    - LLM models: these models generate "raw" and "chat" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
+class InferenceProvider(Protocol):
     """
+    This protocol defines the interface that should be implemented by all inference providers.
+    """
+
+    API_NAMESPACE: str = "Inference"
 
     model_store: ModelStore | None = None
 
@@ -843,21 +892,21 @@ class Inference(Protocol):
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+    ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
         """Generate a completion for the given content using the specified model.
 
         :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param content: The content to generate a completion for
-        :param sampling_params: (Optional) Parameters to control the sampling strategy
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding
+        :param content: The content to generate a completion for.
+        :param sampling_params: (Optional) Parameters to control the sampling strategy.
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
         :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
         :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
         :returns: If stream=False, returns a CompletionResponse with the full completion.
-                 If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
+                 If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
         """
         ...
 
@@ -865,33 +914,42 @@ class Inference(Protocol):
     async def batch_completion(
         self,
         model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ) -> BatchCompletionResponse:
+        """Generate completions for a batch of content using the specified model.
+
+        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
+        :param content_batch: The content to generate completions for.
+        :param sampling_params: (Optional) Parameters to control the sampling strategy.
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
+        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+        :returns: A BatchCompletionResponse with the full completions.
+        """
         raise NotImplementedError("Batch completion is not implemented")
 
     @webmethod(route="/inference/chat-completion", method="POST")
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
+    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
         """Generate a chat completion for the given messages using the specified model.
 
         :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param messages: List of messages in the conversation
-        :param sampling_params: Parameters to control the sampling strategy
-        :param tools: (Optional) List of tool definitions available to the model
+        :param messages: List of messages in the conversation.
+        :param sampling_params: Parameters to control the sampling strategy.
+        :param tools: (Optional) List of tool definitions available to the model.
         :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
             .. deprecated::
                Use tool_config instead.
@@ -908,7 +966,7 @@ class Inference(Protocol):
         :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
         :param tool_config: (Optional) Configuration for tool use.
         :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
-                 If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
+                 If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
         """
         ...
 
@@ -916,23 +974,34 @@ class Inference(Protocol):
     async def batch_chat_completion(
         self,
         model_id: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_config: Optional[ToolConfig] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_config: ToolConfig | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ) -> BatchChatCompletionResponse:
+        """Generate chat completions for a batch of messages using the specified model.
+
+        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
+        :param messages_batch: The messages to generate completions for.
+        :param sampling_params: (Optional) Parameters to control the sampling strategy.
+        :param tools: (Optional) List of tool definitions available to the model.
+        :param tool_config: (Optional) Configuration for tool use.
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
+        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+        :returns: A BatchChatCompletionResponse with the full completions.
+        """
         raise NotImplementedError("Batch chat completion is not implemented")
 
     @webmethod(route="/inference/embeddings", method="POST")
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         """Generate embeddings for content pieces using the specified model.
 
@@ -941,7 +1010,7 @@ class Inference(Protocol):
         :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models.
         :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length.
         :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models.
-        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
+        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.
         """
         ...
 
@@ -950,45 +1019,46 @@ class Inference(Protocol):
         self,
         # Standard OpenAI completion parameters
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
         # vLLM-specific parameters
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         """Generate an OpenAI-compatible completion for the given prompt using the specified model.
 
         :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param prompt: The prompt to generate a completion for
-        :param best_of: (Optional) The number of completions to generate
-        :param echo: (Optional) Whether to echo the prompt
-        :param frequency_penalty: (Optional) The penalty for repeated tokens
-        :param logit_bias: (Optional) The logit bias to use
-        :param logprobs: (Optional) The log probabilities to use
-        :param max_tokens: (Optional) The maximum number of tokens to generate
-        :param n: (Optional) The number of completions to generate
-        :param presence_penalty: (Optional) The penalty for repeated tokens
-        :param seed: (Optional) The seed to use
-        :param stop: (Optional) The stop tokens to use
-        :param stream: (Optional) Whether to stream the response
-        :param stream_options: (Optional) The stream options to use
-        :param temperature: (Optional) The temperature to use
-        :param top_p: (Optional) The top p to use
-        :param user: (Optional) The user to use
+        :param prompt: The prompt to generate a completion for.
+        :param best_of: (Optional) The number of completions to generate.
+        :param echo: (Optional) Whether to echo the prompt.
+        :param frequency_penalty: (Optional) The penalty for repeated tokens.
+        :param logit_bias: (Optional) The logit bias to use.
+        :param logprobs: (Optional) The log probabilities to use.
+        :param max_tokens: (Optional) The maximum number of tokens to generate.
+        :param n: (Optional) The number of completions to generate.
+        :param presence_penalty: (Optional) The penalty for repeated tokens.
+        :param seed: (Optional) The seed to use.
+        :param stop: (Optional) The stop tokens to use.
+        :param stream: (Optional) Whether to stream the response.
+        :param stream_options: (Optional) The stream options to use.
+        :param temperature: (Optional) The temperature to use.
+        :param top_p: (Optional) The top p to use.
+        :param user: (Optional) The user to use.
+        :returns: An OpenAICompletion.
         """
         ...
 
@@ -996,53 +1066,110 @@ class Inference(Protocol):
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         """Generate an OpenAI-compatible chat completion for the given messages using the specified model.
 
         :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param messages: List of messages in the conversation
-        :param frequency_penalty: (Optional) The penalty for repeated tokens
-        :param function_call: (Optional) The function call to use
-        :param functions: (Optional) List of functions to use
-        :param logit_bias: (Optional) The logit bias to use
-        :param logprobs: (Optional) The log probabilities to use
-        :param max_completion_tokens: (Optional) The maximum number of tokens to generate
-        :param max_tokens: (Optional) The maximum number of tokens to generate
-        :param n: (Optional) The number of completions to generate
-        :param parallel_tool_calls: (Optional) Whether to parallelize tool calls
-        :param presence_penalty: (Optional) The penalty for repeated tokens
-        :param response_format: (Optional) The response format to use
-        :param seed: (Optional) The seed to use
-        :param stop: (Optional) The stop tokens to use
-        :param stream: (Optional) Whether to stream the response
-        :param stream_options: (Optional) The stream options to use
-        :param temperature: (Optional) The temperature to use
-        :param tool_choice: (Optional) The tool choice to use
-        :param tools: (Optional) The tools to use
-        :param top_logprobs: (Optional) The top log probabilities to use
-        :param top_p: (Optional) The top p to use
-        :param user: (Optional) The user to use
+        :param messages: List of messages in the conversation.
+        :param frequency_penalty: (Optional) The penalty for repeated tokens.
+        :param function_call: (Optional) The function call to use.
+        :param functions: (Optional) List of functions to use.
+        :param logit_bias: (Optional) The logit bias to use.
+        :param logprobs: (Optional) The log probabilities to use.
+        :param max_completion_tokens: (Optional) The maximum number of tokens to generate.
+        :param max_tokens: (Optional) The maximum number of tokens to generate.
+        :param n: (Optional) The number of completions to generate.
+        :param parallel_tool_calls: (Optional) Whether to parallelize tool calls.
+        :param presence_penalty: (Optional) The penalty for repeated tokens.
+        :param response_format: (Optional) The response format to use.
+        :param seed: (Optional) The seed to use.
+        :param stop: (Optional) The stop tokens to use.
+        :param stream: (Optional) Whether to stream the response.
+        :param stream_options: (Optional) The stream options to use.
+        :param temperature: (Optional) The temperature to use.
+        :param tool_choice: (Optional) The tool choice to use.
+        :param tools: (Optional) The tools to use.
+        :param top_logprobs: (Optional) The top log probabilities to use.
+        :param top_p: (Optional) The top p to use.
+        :param user: (Optional) The user to use.
+        :returns: An OpenAIChatCompletion.
         """
         ...
+
+    @webmethod(route="/openai/v1/embeddings", method="POST")
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        """Generate OpenAI-compatible embeddings for the given input using the specified model.
+
+        :param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
+        :param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
+        :param encoding_format: (Optional) The format to return the embeddings in. Can be either "float" or "base64". Defaults to "float".
+        :param dimensions: (Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
+        :param user: (Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse.
+        :returns: An OpenAIEmbeddingsResponse containing the embeddings.
+        """
+        ...
+
+
+class Inference(InferenceProvider):
+    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+    This API provides the raw interface to the underlying models. Two kinds of models are supported:
+    - LLM models: these models generate "raw" and "chat" (conversational) completions.
+    - Embedding models: these models generate embeddings to be used for semantic search.
+    """
+
+    @webmethod(route="/openai/v1/chat/completions", method="GET")
+    async def list_chat_completions(
+        self,
+        after: str | None = None,
+        limit: int | None = 20,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIChatCompletionResponse:
+        """List all chat completions.
+
+        :param after: (Optional) Pagination cursor: the ID of the last chat completion from the previous page; results start after it.
+        :param limit: The maximum number of chat completions to return.
+        :param model: The model to filter by.
+        :param order: The order to sort the chat completions by: "asc" or "desc". Defaults to "desc".
+        :returns: A ListOpenAIChatCompletionResponse.
+        """
+        raise NotImplementedError("List chat completions is not implemented")
+
+    @webmethod(route="/openai/v1/chat/completions/{completion_id}", method="GET")
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        """Describe a chat completion by its ID.
+
+        :param completion_id: ID of the chat completion.
+        :returns: An OpenAICompletionWithInputMessages.
+        """
+        raise NotImplementedError("Get chat completion is not implemented")
diff --git a/llama_stack/apis/inspect/inspect.py b/llama_stack/apis/inspect/inspect.py
index 863f90e14..44a5e95b2 100644
--- a/llama_stack/apis/inspect/inspect.py
+++ b/llama_stack/apis/inspect/inspect.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List, Protocol, runtime_checkable
+from typing import Protocol, runtime_checkable
 
 from pydantic import BaseModel
 
@@ -16,7 +16,7 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 class RouteInfo(BaseModel):
     route: str
     method: str
-    provider_types: List[str]
+    provider_types: list[str]
 
 
 @json_schema_type
@@ -30,16 +30,31 @@ class VersionInfo(BaseModel):
 
 
 class ListRoutesResponse(BaseModel):
-    data: List[RouteInfo]
+    data: list[RouteInfo]
 
 
 @runtime_checkable
 class Inspect(Protocol):
     @webmethod(route="/inspect/routes", method="GET")
-    async def list_routes(self) -> ListRoutesResponse: ...
+    async def list_routes(self) -> ListRoutesResponse:
+        """List all routes.
+
+        :returns: A ListRoutesResponse.
+        """
+        ...
 
     @webmethod(route="/health", method="GET")
-    async def health(self) -> HealthInfo: ...
+    async def health(self) -> HealthInfo:
+        """Get the health of the service.
+
+        :returns: A HealthInfo.
+        """
+        ...
 
     @webmethod(route="/version", method="GET")
-    async def version(self) -> VersionInfo: ...
+    async def version(self) -> VersionInfo:
+        """Get the version of the service.
+
+        :returns: A VersionInfo.
+        """
+        ...
diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py
index 97398ce75..3d90a92a0 100644
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+from typing import Any, Literal, Protocol, runtime_checkable
 
 from pydantic import BaseModel, ConfigDict, Field
 
@@ -15,7 +15,7 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 
 
 class CommonModelFields(BaseModel):
-    metadata: Dict[str, Any] = Field(
+    metadata: dict[str, Any] = Field(
         default_factory=dict,
         description="Any additional metadata for this model",
     )
@@ -29,14 +29,14 @@ class ModelType(str, Enum):
 
 @json_schema_type
 class Model(CommonModelFields, Resource):
-    type: Literal[ResourceType.model.value] = ResourceType.model.value
+    type: Literal[ResourceType.model] = ResourceType.model
 
     @property
     def model_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_model_id(self) -> str:
+    def provider_model_id(self) -> str | None:
         return self.provider_resource_id
 
     model_config = ConfigDict(protected_namespaces=())
@@ -46,14 +46,14 @@ class Model(CommonModelFields, Resource):
 
 class ModelInput(CommonModelFields):
     model_id: str
-    provider_id: Optional[str] = None
-    provider_model_id: Optional[str] = None
-    model_type: Optional[ModelType] = ModelType.llm
+    provider_id: str | None = None
+    provider_model_id: str | None = None
+    model_type: ModelType | None = ModelType.llm
     model_config = ConfigDict(protected_namespaces=())
 
 
 class ListModelsResponse(BaseModel):
-    data: List[Model]
+    data: list[Model]
 
 
 @json_schema_type
@@ -73,36 +73,67 @@ class OpenAIModel(BaseModel):
 
 
 class OpenAIListModelsResponse(BaseModel):
-    data: List[OpenAIModel]
+    data: list[OpenAIModel]
 
 
 @runtime_checkable
 @trace_protocol
 class Models(Protocol):
     @webmethod(route="/models", method="GET")
-    async def list_models(self) -> ListModelsResponse: ...
+    async def list_models(self) -> ListModelsResponse:
+        """List all models.
+
+        :returns: A ListModelsResponse.
+        """
+        ...
 
     @webmethod(route="/openai/v1/models", method="GET")
-    async def openai_list_models(self) -> OpenAIListModelsResponse: ...
+    async def openai_list_models(self) -> OpenAIListModelsResponse:
+        """List models using the OpenAI API.
+
+        :returns: An OpenAIListModelsResponse.
+        """
+        ...
 
     @webmethod(route="/models/{model_id:path}", method="GET")
     async def get_model(
         self,
         model_id: str,
-    ) -> Model: ...
+    ) -> Model:
+        """Get a model by its identifier.
+
+        :param model_id: The identifier of the model to get.
+        :returns: A Model.
+        """
+        ...
 
     @webmethod(route="/models", method="POST")
     async def register_model(
         self,
         model_id: str,
-        provider_model_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        model_type: Optional[ModelType] = None,
-    ) -> Model: ...
+        provider_model_id: str | None = None,
+        provider_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        model_type: ModelType | None = None,
+    ) -> Model:
+        """Register a model.
+
+        :param model_id: The identifier of the model to register.
+        :param provider_model_id: The identifier of the model in the provider.
+        :param provider_id: The identifier of the provider.
+        :param metadata: Any additional metadata for this model.
+        :param model_type: The type of model to register.
+        :returns: A Model.
+        """
+        ...
 
     @webmethod(route="/models/{model_id:path}", method="DELETE")
     async def unregister_model(
         self,
         model_id: str,
-    ) -> None: ...
+    ) -> None:
+        """Unregister a model.
+
+        :param model_id: The identifier of the model to unregister.
+        """
+        ...
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index e5f1bcb65..b196c8a17 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -6,10 +6,9 @@
 
 from datetime import datetime
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Protocol, Union
+from typing import Annotated, Any, Literal, Protocol
 
 from pydantic import BaseModel, Field
-from typing_extensions import Annotated
 
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.common.job_types import JobStatus
@@ -36,9 +35,9 @@ class DataConfig(BaseModel):
     batch_size: int
     shuffle: bool
     data_format: DatasetFormat
-    validation_dataset_id: Optional[str] = None
-    packed: Optional[bool] = False
-    train_on_input: Optional[bool] = False
+    validation_dataset_id: str | None = None
+    packed: bool | None = False
+    train_on_input: bool | None = False
 
 
 @json_schema_type
@@ -51,10 +50,10 @@ class OptimizerConfig(BaseModel):
 
 @json_schema_type
 class EfficiencyConfig(BaseModel):
-    enable_activation_checkpointing: Optional[bool] = False
-    enable_activation_offloading: Optional[bool] = False
-    memory_efficient_fsdp_wrap: Optional[bool] = False
-    fsdp_cpu_offload: Optional[bool] = False
+    enable_activation_checkpointing: bool | None = False
+    enable_activation_offloading: bool | None = False
+    memory_efficient_fsdp_wrap: bool | None = False
+    fsdp_cpu_offload: bool | None = False
 
 
 @json_schema_type
@@ -62,23 +61,23 @@ class TrainingConfig(BaseModel):
     n_epochs: int
     max_steps_per_epoch: int = 1
     gradient_accumulation_steps: int = 1
-    max_validation_steps: Optional[int] = 1
-    data_config: Optional[DataConfig] = None
-    optimizer_config: Optional[OptimizerConfig] = None
-    efficiency_config: Optional[EfficiencyConfig] = None
-    dtype: Optional[str] = "bf16"
+    max_validation_steps: int | None = 1
+    data_config: DataConfig | None = None
+    optimizer_config: OptimizerConfig | None = None
+    efficiency_config: EfficiencyConfig | None = None
+    dtype: str | None = "bf16"
 
 
 @json_schema_type
 class LoraFinetuningConfig(BaseModel):
     type: Literal["LoRA"] = "LoRA"
-    lora_attn_modules: List[str]
+    lora_attn_modules: list[str]
     apply_lora_to_mlp: bool
     apply_lora_to_output: bool
     rank: int
     alpha: int
-    use_dora: Optional[bool] = False
-    quantize_base: Optional[bool] = False
+    use_dora: bool | None = False
+    quantize_base: bool | None = False
 
 
 @json_schema_type
@@ -88,7 +87,7 @@ class QATFinetuningConfig(BaseModel):
     group_size: int
 
 
-AlgorithmConfig = Annotated[Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")]
+AlgorithmConfig = Annotated[LoraFinetuningConfig | QATFinetuningConfig, Field(discriminator="type")]
 register_schema(AlgorithmConfig, name="AlgorithmConfig")
 
 
@@ -97,7 +96,7 @@ class PostTrainingJobLogStream(BaseModel):
     """Stream of logs from a finetuning job."""
 
     job_uuid: str
-    log_lines: List[str]
+    log_lines: list[str]
 
 
 @json_schema_type
@@ -131,8 +130,8 @@ class PostTrainingRLHFRequest(BaseModel):
     training_config: TrainingConfig
 
     # TODO: define these
-    hyperparam_search_config: Dict[str, Any]
-    logger_config: Dict[str, Any]
+    hyperparam_search_config: dict[str, Any]
+    logger_config: dict[str, Any]
 
 
 class PostTrainingJob(BaseModel):
@@ -146,17 +145,17 @@ class PostTrainingJobStatusResponse(BaseModel):
     job_uuid: str
     status: JobStatus
 
-    scheduled_at: Optional[datetime] = None
-    started_at: Optional[datetime] = None
-    completed_at: Optional[datetime] = None
+    scheduled_at: datetime | None = None
+    started_at: datetime | None = None
+    completed_at: datetime | None = None
 
-    resources_allocated: Optional[Dict[str, Any]] = None
+    resources_allocated: dict[str, Any] | None = None
 
-    checkpoints: List[Checkpoint] = Field(default_factory=list)
+    checkpoints: list[Checkpoint] = Field(default_factory=list)
 
 
 class ListPostTrainingJobsResponse(BaseModel):
-    data: List[PostTrainingJob]
+    data: list[PostTrainingJob]
 
 
 @json_schema_type
@@ -164,7 +163,7 @@ class PostTrainingJobArtifactsResponse(BaseModel):
     """Artifacts of a finetuning job."""
 
     job_uuid: str
-    checkpoints: List[Checkpoint] = Field(default_factory=list)
+    checkpoints: list[Checkpoint] = Field(default_factory=list)
 
     # TODO(ashwin): metrics, evals
 
@@ -175,15 +174,27 @@ class PostTraining(Protocol):
         self,
         job_uuid: str,
         training_config: TrainingConfig,
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
-        model: Optional[str] = Field(
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
+        model: str | None = Field(
             default=None,
             description="Model descriptor for training if not in provider config`",
         ),
-        checkpoint_dir: Optional[str] = None,
-        algorithm_config: Optional[AlgorithmConfig] = None,
-    ) -> PostTrainingJob: ...
+        checkpoint_dir: str | None = None,
+        algorithm_config: AlgorithmConfig | None = None,
+    ) -> PostTrainingJob:
+        """Run supervised fine-tuning of a model.
+
+        :param job_uuid: The UUID of the job to create.
+        :param training_config: The training configuration.
+        :param hyperparam_search_config: The hyperparameter search configuration.
+        :param logger_config: The logger configuration.
+        :param model: The model to fine-tune.
+        :param checkpoint_dir: The directory to save checkpoint(s) to.
+        :param algorithm_config: The algorithm configuration.
+        :returns: A PostTrainingJob.
+        """
+        ...
 
     @webmethod(route="/post-training/preference-optimize", method="POST")
     async def preference_optimize(
@@ -192,18 +203,51 @@ class PostTraining(Protocol):
         finetuned_model: str,
         algorithm_config: DPOAlignmentConfig,
         training_config: TrainingConfig,
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
-    ) -> PostTrainingJob: ...
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
+    ) -> PostTrainingJob:
+        """Run preference optimization of a model.
+
+        :param job_uuid: The UUID of the job to create.
+        :param finetuned_model: The model to fine-tune.
+        :param algorithm_config: The algorithm configuration.
+        :param training_config: The training configuration.
+        :param hyperparam_search_config: The hyperparameter search configuration.
+        :param logger_config: The logger configuration.
+        :returns: A PostTrainingJob.
+        """
+        ...
 
     @webmethod(route="/post-training/jobs", method="GET")
-    async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ...
+    async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
+        """Get all training jobs.
+
+        :returns: A ListPostTrainingJobsResponse.
+        """
+        ...
 
     @webmethod(route="/post-training/job/status", method="GET")
-    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse: ...
+    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse:
+        """Get the status of a training job.
+
+        :param job_uuid: The UUID of the job to get the status of.
+        :returns: A PostTrainingJobStatusResponse.
+        """
+        ...
 
     @webmethod(route="/post-training/job/cancel", method="POST")
-    async def cancel_training_job(self, job_uuid: str) -> None: ...
+    async def cancel_training_job(self, job_uuid: str) -> None:
+        """Cancel a training job.
+
+        :param job_uuid: The UUID of the job to cancel.
+        """
+        ...
 
     @webmethod(route="/post-training/job/artifacts", method="GET")
-    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse: ...
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
+        """Get the artifacts of a training job.
+
+        :param job_uuid: The UUID of the job to get the artifacts of.
+        :returns: A PostTrainingJobArtifactsResponse.
+        """
+        ...
diff --git a/llama_stack/apis/providers/providers.py b/llama_stack/apis/providers/providers.py
index ea5f968ec..4bc977bf1 100644
--- a/llama_stack/apis/providers/providers.py
+++ b/llama_stack/apis/providers/providers.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List, Protocol, runtime_checkable
+from typing import Any, Protocol, runtime_checkable
 
 from pydantic import BaseModel
 
@@ -17,12 +17,12 @@ class ProviderInfo(BaseModel):
     api: str
     provider_id: str
     provider_type: str
-    config: Dict[str, Any]
+    config: dict[str, Any]
     health: HealthResponse
 
 
 class ListProvidersResponse(BaseModel):
-    data: List[ProviderInfo]
+    data: list[ProviderInfo]
 
 
 @runtime_checkable
@@ -32,7 +32,18 @@ class Providers(Protocol):
     """
 
     @webmethod(route="/providers", method="GET")
-    async def list_providers(self) -> ListProvidersResponse: ...
+    async def list_providers(self) -> ListProvidersResponse:
+        """List all available providers.
+
+        :returns: A ListProvidersResponse containing information about all providers.
+        """
+        ...
 
     @webmethod(route="/providers/{provider_id}", method="GET")
-    async def inspect_provider(self, provider_id: str) -> ProviderInfo: ...
+    async def inspect_provider(self, provider_id: str) -> ProviderInfo:
+        """Get detailed information about a specific provider.
+
+        :param provider_id: The ID of the provider to inspect.
+        :returns: A ProviderInfo object containing the provider's details.
+        """
+        ...
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py
index 70ec63c55..175baa7b9 100644
--- a/llama_stack/apis/resource.py
+++ b/llama_stack/apis/resource.py
@@ -4,12 +4,23 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import sys
 from enum import Enum
 
 from pydantic import BaseModel, Field
 
+# TODO: use enum.StrEnum when we drop support for python 3.10
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
 
-class ResourceType(Enum):
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10 and below."""
+
+        pass
+
+
+class ResourceType(StrEnum):
     model = "model"
     shield = "shield"
     vector_db = "vector_db"
@@ -25,9 +36,9 @@ class Resource(BaseModel):
 
     identifier: str = Field(description="Unique identifier for this resource in llama stack")
 
-    provider_resource_id: str = Field(
-        description="Unique identifier for this resource in the provider",
+    provider_resource_id: str | None = Field(
         default=None,
+        description="Unique identifier for this resource in the provider",
     )
 
     provider_id: str = Field(description="ID of the provider that owns this resource")
diff --git a/llama_stack/apis/safety/safety.py b/llama_stack/apis/safety/safety.py
index fd2f0292c..3aee52b7e 100644
--- a/llama_stack/apis/safety/safety.py
+++ b/llama_stack/apis/safety/safety.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
+from typing import Any, Protocol, runtime_checkable
 
 from pydantic import BaseModel, Field
 
@@ -27,16 +27,16 @@ class SafetyViolation(BaseModel):
     violation_level: ViolationLevel
 
     # what message should you convey to the user
-    user_message: Optional[str] = None
+    user_message: str | None = None
 
     # additional metadata (including specific violation codes) more for
     # debugging, telemetry
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: dict[str, Any] = Field(default_factory=dict)
 
 
 @json_schema_type
 class RunShieldResponse(BaseModel):
-    violation: Optional[SafetyViolation] = None
+    violation: SafetyViolation | None = None
 
 
 class ShieldStore(Protocol):
@@ -52,6 +52,14 @@ class Safety(Protocol):
     async def run_shield(
         self,
         shield_id: str,
-        messages: List[Message],
-        params: Dict[str, Any] = None,
-    ) -> RunShieldResponse: ...
+        messages: list[Message],
+        params: dict[str, Any],
+    ) -> RunShieldResponse:
+        """Run a shield.
+
+        :param shield_id: The identifier of the shield to run.
+        :param messages: The messages to run the shield on.
+        :param params: The parameters of the shield.
+        :returns: A RunShieldResponse.
+        """
+        ...
diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py
index 54a9ac2aa..732e80e79 100644
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
+from typing import Any, Protocol, runtime_checkable
 
 from pydantic import BaseModel
 
@@ -12,7 +12,7 @@ from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
 from llama_stack.schema_utils import json_schema_type, webmethod
 
 # mapping of metric to value
-ScoringResultRow = Dict[str, Any]
+ScoringResultRow = dict[str, Any]
 
 
 @json_schema_type
@@ -24,15 +24,15 @@ class ScoringResult(BaseModel):
     :param aggregated_results: Map of metric name to aggregated value
     """
 
-    score_rows: List[ScoringResultRow]
+    score_rows: list[ScoringResultRow]
     # aggregated metrics to value
-    aggregated_results: Dict[str, Any]
+    aggregated_results: dict[str, Any]
 
 
 @json_schema_type
 class ScoreBatchResponse(BaseModel):
-    dataset_id: Optional[str] = None
-    results: Dict[str, ScoringResult]
+    dataset_id: str | None = None
+    results: dict[str, ScoringResult]
 
 
 @json_schema_type
@@ -44,7 +44,7 @@ class ScoreResponse(BaseModel):
     """
 
     # each key in the dict is a scoring function name
-    results: Dict[str, ScoringResult]
+    results: dict[str, ScoringResult]
 
 
 class ScoringFunctionStore(Protocol):
@@ -59,20 +59,28 @@ class Scoring(Protocol):
     async def score_batch(
         self,
         dataset_id: str,
-        scoring_functions: Dict[str, Optional[ScoringFnParams]],
+        scoring_functions: dict[str, ScoringFnParams | None],
         save_results_dataset: bool = False,
-    ) -> ScoreBatchResponse: ...
+    ) -> ScoreBatchResponse:
+        """Score a batch of rows.
+
+        :param dataset_id: The ID of the dataset to score.
+        :param scoring_functions: The scoring functions to use for the scoring.
+        :param save_results_dataset: Whether to save the results to a dataset.
+        :returns: A ScoreBatchResponse.
+        """
+        ...
 
     @webmethod(route="/scoring/score", method="POST")
     async def score(
         self,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: Dict[str, Optional[ScoringFnParams]],
+        input_rows: list[dict[str, Any]],
+        scoring_functions: dict[str, ScoringFnParams | None],
     ) -> ScoreResponse:
         """Score a list of rows.
 
         :param input_rows: The rows to score.
         :param scoring_functions: The scoring functions to use for the scoring.
-        :return: ScoreResponse object containing rows and aggregated results
+        :returns: A ScoreResponse object containing rows and aggregated results.
         """
         ...
diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py
index 4f85947dd..9cd21b7d1 100644
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -4,37 +4,44 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+# TODO: use enum.StrEnum when we drop support for python 3.10
+import sys
 from enum import Enum
 from typing import (
+    Annotated,
     Any,
-    Dict,
-    List,
     Literal,
-    Optional,
     Protocol,
-    Union,
     runtime_checkable,
 )
 
 from pydantic import BaseModel, Field
-from typing_extensions import Annotated
 
 from llama_stack.apis.common.type_system import ParamType
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
+
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10 and below."""
+
+        pass
+
 
 # Perhaps more structure can be imposed on these functions. Maybe they could be associated
 # with standard metrics so they can be rolled up?
 @json_schema_type
-class ScoringFnParamsType(Enum):
+class ScoringFnParamsType(StrEnum):
     llm_as_judge = "llm_as_judge"
     regex_parser = "regex_parser"
     basic = "basic"
 
 
 @json_schema_type
-class AggregationFunctionType(Enum):
+class AggregationFunctionType(StrEnum):
     average = "average"
     weighted_average = "weighted_average"
     median = "median"
@@ -44,62 +51,58 @@ class AggregationFunctionType(Enum):
 
 @json_schema_type
 class LLMAsJudgeScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.llm_as_judge.value] = ScoringFnParamsType.llm_as_judge.value
+    type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
     judge_model: str
-    prompt_template: Optional[str] = None
-    judge_score_regexes: Optional[List[str]] = Field(
+    prompt_template: str | None = None
+    judge_score_regexes: list[str] = Field(
         description="Regexes to extract the answer from generated response",
-        default_factory=list,
+        default_factory=lambda: [],
     )
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
+    aggregation_functions: list[AggregationFunctionType] = Field(
         description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
+        default_factory=lambda: [],
     )
 
 
 @json_schema_type
 class RegexParserScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.regex_parser.value] = ScoringFnParamsType.regex_parser.value
-    parsing_regexes: Optional[List[str]] = Field(
+    type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
+    parsing_regexes: list[str] = Field(
         description="Regex to extract the answer from generated response",
-        default_factory=list,
+        default_factory=lambda: [],
     )
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
+    aggregation_functions: list[AggregationFunctionType] = Field(
         description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
+        default_factory=lambda: [],
     )
 
 
 @json_schema_type
 class BasicScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.basic.value] = ScoringFnParamsType.basic.value
-    aggregation_functions: Optional[List[AggregationFunctionType]] = Field(
+    type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
+    aggregation_functions: list[AggregationFunctionType] = Field(
         description="Aggregation functions to apply to the scores of each row",
         default_factory=list,
     )
 
 
 ScoringFnParams = Annotated[
-    Union[
-        LLMAsJudgeScoringFnParams,
-        RegexParserScoringFnParams,
-        BasicScoringFnParams,
-    ],
+    LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams,
     Field(discriminator="type"),
 ]
 register_schema(ScoringFnParams, name="ScoringFnParams")
 
 
 class CommonScoringFnFields(BaseModel):
-    description: Optional[str] = None
-    metadata: Dict[str, Any] = Field(
+    description: str | None = None
+    metadata: dict[str, Any] = Field(
         default_factory=dict,
         description="Any additional metadata for this definition",
     )
     return_type: ParamType = Field(
         description="The return type of the deterministic function",
     )
-    params: Optional[ScoringFnParams] = Field(
+    params: ScoringFnParams | None = Field(
         description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
         default=None,
     )
@@ -107,34 +110,45 @@ class CommonScoringFnFields(BaseModel):
 
 @json_schema_type
 class ScoringFn(CommonScoringFnFields, Resource):
-    type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
+    type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function
 
     @property
     def scoring_fn_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_scoring_fn_id(self) -> str:
+    def provider_scoring_fn_id(self) -> str | None:
         return self.provider_resource_id
 
 
 class ScoringFnInput(CommonScoringFnFields, BaseModel):
     scoring_fn_id: str
-    provider_id: Optional[str] = None
-    provider_scoring_fn_id: Optional[str] = None
+    provider_id: str | None = None
+    provider_scoring_fn_id: str | None = None
 
 
 class ListScoringFunctionsResponse(BaseModel):
-    data: List[ScoringFn]
+    data: list[ScoringFn]
 
 
 @runtime_checkable
 class ScoringFunctions(Protocol):
     @webmethod(route="/scoring-functions", method="GET")
-    async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...
+    async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
+        """List all scoring functions.
+
+        :returns: A ListScoringFunctionsResponse.
+        """
+        ...
 
     @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
-    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn: ...
+    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn:
+        """Get a scoring function by its ID.
+
+        :param scoring_fn_id: The ID of the scoring function to get.
+        :returns: A ScoringFn.
+        """
+        ...
 
     @webmethod(route="/scoring-functions", method="POST")
     async def register_scoring_function(
@@ -142,7 +156,17 @@ class ScoringFunctions(Protocol):
         scoring_fn_id: str,
         description: str,
         return_type: ParamType,
-        provider_scoring_fn_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        params: Optional[ScoringFnParams] = None,
-    ) -> None: ...
+        provider_scoring_fn_id: str | None = None,
+        provider_id: str | None = None,
+        params: ScoringFnParams | None = None,
+    ) -> None:
+        """Register a scoring function.
+
+        :param scoring_fn_id: The ID of the scoring function to register.
+        :param description: The description of the scoring function.
+        :param return_type: The return type of the scoring function.
+        :param provider_scoring_fn_id: The ID of the provider scoring function to use for the scoring function.
+        :param provider_id: The ID of the provider to use for the scoring function.
+        :param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
+        """
+        ...
diff --git a/llama_stack/apis/shields/shields.py b/llama_stack/apis/shields/shields.py
index 67f3bd27b..ce1f73d8e 100644
--- a/llama_stack/apis/shields/shields.py
+++ b/llama_stack/apis/shields/shields.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+from typing import Any, Literal, Protocol, runtime_checkable
 
 from pydantic import BaseModel
 
@@ -14,48 +14,68 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 
 
 class CommonShieldFields(BaseModel):
-    params: Optional[Dict[str, Any]] = None
+    params: dict[str, Any] | None = None
 
 
 @json_schema_type
 class Shield(CommonShieldFields, Resource):
     """A safety shield resource that can be used to check content"""
 
-    type: Literal[ResourceType.shield.value] = ResourceType.shield.value
+    type: Literal[ResourceType.shield] = ResourceType.shield
 
     @property
     def shield_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_shield_id(self) -> str:
+    def provider_shield_id(self) -> str | None:
         return self.provider_resource_id
 
 
 class ShieldInput(CommonShieldFields):
     shield_id: str
-    provider_id: Optional[str] = None
-    provider_shield_id: Optional[str] = None
+    provider_id: str | None = None
+    provider_shield_id: str | None = None
 
 
 class ListShieldsResponse(BaseModel):
-    data: List[Shield]
+    data: list[Shield]
 
 
 @runtime_checkable
 @trace_protocol
 class Shields(Protocol):
     @webmethod(route="/shields", method="GET")
-    async def list_shields(self) -> ListShieldsResponse: ...
+    async def list_shields(self) -> ListShieldsResponse:
+        """List all shields.
+
+        :returns: A ListShieldsResponse.
+        """
+        ...
 
     @webmethod(route="/shields/{identifier:path}", method="GET")
-    async def get_shield(self, identifier: str) -> Shield: ...
+    async def get_shield(self, identifier: str) -> Shield:
+        """Get a shield by its identifier.
+
+        :param identifier: The identifier of the shield to get.
+        :returns: A Shield.
+        """
+        ...
 
     @webmethod(route="/shields", method="POST")
     async def register_shield(
         self,
         shield_id: str,
-        provider_shield_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        params: Optional[Dict[str, Any]] = None,
-    ) -> Shield: ...
+        provider_shield_id: str | None = None,
+        provider_id: str | None = None,
+        params: dict[str, Any] | None = None,
+    ) -> Shield:
+        """Register a shield.
+
+        :param shield_id: The identifier of the shield to register.
+        :param provider_shield_id: The identifier of the shield in the provider.
+        :param provider_id: The identifier of the provider.
+        :param params: The parameters of the shield.
+        :returns: A Shield.
+        """
+        ...
diff --git a/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py b/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py
index 7b41192af..91e550da9 100644
--- a/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py
+++ b/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict, List, Optional, Protocol, Union
+from typing import Any, Protocol
 
 from pydantic import BaseModel
 
@@ -28,24 +28,24 @@ class FilteringFunction(Enum):
 class SyntheticDataGenerationRequest(BaseModel):
     """Request to generate synthetic data. A small batch of prompts and a filtering function"""
 
-    dialogs: List[Message]
+    dialogs: list[Message]
     filtering_function: FilteringFunction = FilteringFunction.none
-    model: Optional[str] = None
+    model: str | None = None
 
 
 @json_schema_type
 class SyntheticDataGenerationResponse(BaseModel):
     """Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."""
 
-    synthetic_data: List[Dict[str, Any]]
-    statistics: Optional[Dict[str, Any]] = None
+    synthetic_data: list[dict[str, Any]]
+    statistics: dict[str, Any] | None = None
 
 
 class SyntheticDataGeneration(Protocol):
     @webmethod(route="/synthetic-data-generation/generate")
     def synthetic_data_generate(
         self,
-        dialogs: List[Message],
+        dialogs: list[Message],
         filtering_function: FilteringFunction = FilteringFunction.none,
-        model: Optional[str] = None,
-    ) -> Union[SyntheticDataGenerationResponse]: ...
+        model: str | None = None,
+    ) -> SyntheticDataGenerationResponse: ...
diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py
index d57c311b2..0eb53f397 100644
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@@ -7,18 +7,14 @@
 from datetime import datetime
 from enum import Enum
 from typing import (
+    Annotated,
     Any,
-    Dict,
-    List,
     Literal,
-    Optional,
     Protocol,
-    Union,
     runtime_checkable,
 )
 
 from pydantic import BaseModel, Field
-from typing_extensions import Annotated
 
 from llama_stack.models.llama.datatypes import Primitive
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@@ -37,11 +33,11 @@ class SpanStatus(Enum):
 class Span(BaseModel):
     span_id: str
     trace_id: str
-    parent_span_id: Optional[str] = None
+    parent_span_id: str | None = None
     name: str
     start_time: datetime
-    end_time: Optional[datetime] = None
-    attributes: Optional[Dict[str, Any]] = Field(default_factory=dict)
+    end_time: datetime | None = None
+    attributes: dict[str, Any] | None = Field(default_factory=lambda: {})
 
     def set_attribute(self, key: str, value: Any):
         if self.attributes is None:
@@ -54,7 +50,7 @@ class Trace(BaseModel):
     trace_id: str
     root_span_id: str
     start_time: datetime
-    end_time: Optional[datetime] = None
+    end_time: datetime | None = None
 
 
 @json_schema_type
@@ -78,29 +74,29 @@ class EventCommon(BaseModel):
     trace_id: str
     span_id: str
     timestamp: datetime
-    attributes: Optional[Dict[str, Primitive]] = Field(default_factory=dict)
+    attributes: dict[str, Primitive] | None = Field(default_factory=lambda: {})
 
 
 @json_schema_type
 class UnstructuredLogEvent(EventCommon):
-    type: Literal[EventType.UNSTRUCTURED_LOG.value] = EventType.UNSTRUCTURED_LOG.value
+    type: Literal[EventType.UNSTRUCTURED_LOG] = EventType.UNSTRUCTURED_LOG
     message: str
     severity: LogSeverity
 
 
 @json_schema_type
 class MetricEvent(EventCommon):
-    type: Literal[EventType.METRIC.value] = EventType.METRIC.value
+    type: Literal[EventType.METRIC] = EventType.METRIC
     metric: str  # this would be an enum
-    value: Union[int, float]
+    value: int | float
     unit: str
 
 
 @json_schema_type
 class MetricInResponse(BaseModel):
     metric: str
-    value: Union[int, float]
-    unit: Optional[str] = None
+    value: int | float
+    unit: str | None = None
 
 
 # This is a short term solution to allow inference API to return metrics
@@ -124,7 +120,7 @@ class MetricInResponse(BaseModel):
 
 
 class MetricResponseMixin(BaseModel):
-    metrics: Optional[List[MetricInResponse]] = None
+    metrics: list[MetricInResponse] | None = None
 
 
 @json_schema_type
@@ -135,22 +131,19 @@ class StructuredLogType(Enum):
 
 @json_schema_type
 class SpanStartPayload(BaseModel):
-    type: Literal[StructuredLogType.SPAN_START.value] = StructuredLogType.SPAN_START.value
+    type: Literal[StructuredLogType.SPAN_START] = StructuredLogType.SPAN_START
     name: str
-    parent_span_id: Optional[str] = None
+    parent_span_id: str | None = None
 
 
 @json_schema_type
 class SpanEndPayload(BaseModel):
-    type: Literal[StructuredLogType.SPAN_END.value] = StructuredLogType.SPAN_END.value
+    type: Literal[StructuredLogType.SPAN_END] = StructuredLogType.SPAN_END
     status: SpanStatus
 
 
 StructuredLogPayload = Annotated[
-    Union[
-        SpanStartPayload,
-        SpanEndPayload,
-    ],
+    SpanStartPayload | SpanEndPayload,
     Field(discriminator="type"),
 ]
 register_schema(StructuredLogPayload, name="StructuredLogPayload")
@@ -158,16 +151,12 @@ register_schema(StructuredLogPayload, name="StructuredLogPayload")
 
 @json_schema_type
 class StructuredLogEvent(EventCommon):
-    type: Literal[EventType.STRUCTURED_LOG.value] = EventType.STRUCTURED_LOG.value
+    type: Literal[EventType.STRUCTURED_LOG] = EventType.STRUCTURED_LOG
     payload: StructuredLogPayload
 
 
 Event = Annotated[
-    Union[
-        UnstructuredLogEvent,
-        MetricEvent,
-        StructuredLogEvent,
-    ],
+    UnstructuredLogEvent | MetricEvent | StructuredLogEvent,
     Field(discriminator="type"),
 ]
 register_schema(Event, name="Event")
@@ -184,7 +173,7 @@ class EvalTrace(BaseModel):
 
 @json_schema_type
 class SpanWithStatus(Span):
-    status: Optional[SpanStatus] = None
+    status: SpanStatus | None = None
 
 
 @json_schema_type
@@ -203,58 +192,177 @@ class QueryCondition(BaseModel):
 
 
 class QueryTracesResponse(BaseModel):
-    data: List[Trace]
+    data: list[Trace]
 
 
 class QuerySpansResponse(BaseModel):
-    data: List[Span]
+    data: list[Span]
 
 
 class QuerySpanTreeResponse(BaseModel):
-    data: Dict[str, SpanWithStatus]
+    data: dict[str, SpanWithStatus]
+
+
+class MetricQueryType(Enum):
+    RANGE = "range"
+    INSTANT = "instant"
+
+
+class MetricLabelOperator(Enum):
+    EQUALS = "="
+    NOT_EQUALS = "!="
+    REGEX_MATCH = "=~"
+    REGEX_NOT_MATCH = "!~"
+
+
+class MetricLabelMatcher(BaseModel):
+    name: str
+    value: str
+    operator: MetricLabelOperator = MetricLabelOperator.EQUALS
+
+
+@json_schema_type
+class MetricLabel(BaseModel):
+    name: str
+    value: str
+
+
+@json_schema_type
+class MetricDataPoint(BaseModel):
+    timestamp: int
+    value: float
+
+
+@json_schema_type
+class MetricSeries(BaseModel):
+    metric: str
+    labels: list[MetricLabel]
+    values: list[MetricDataPoint]
+
+
+class QueryMetricsResponse(BaseModel):
+    data: list[MetricSeries]
 
 
 @runtime_checkable
 class Telemetry(Protocol):
     @webmethod(route="/telemetry/events", method="POST")
-    async def log_event(self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400) -> None: ...
+    async def log_event(
+        self,
+        event: Event,
+        ttl_seconds: int = DEFAULT_TTL_DAYS * 86400,
+    ) -> None:
+        """Log an event.
+
+        :param event: The event to log.
+        :param ttl_seconds: The time to live of the event, in seconds.
+        """
+        ...
 
     @webmethod(route="/telemetry/traces", method="POST")
     async def query_traces(
         self,
-        attribute_filters: Optional[List[QueryCondition]] = None,
-        limit: Optional[int] = 100,
-        offset: Optional[int] = 0,
-        order_by: Optional[List[str]] = None,
-    ) -> QueryTracesResponse: ...
+        attribute_filters: list[QueryCondition] | None = None,
+        limit: int | None = 100,
+        offset: int | None = 0,
+        order_by: list[str] | None = None,
+    ) -> QueryTracesResponse:
+        """Query traces.
+
+        :param attribute_filters: The attribute filters to apply to the traces.
+        :param limit: The maximum number of traces to return.
+        :param offset: The offset of the traces to return.
+        :param order_by: The ordering to apply to the returned traces.
+        :returns: A QueryTracesResponse.
+        """
+        ...
 
     @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET")
-    async def get_trace(self, trace_id: str) -> Trace: ...
+    async def get_trace(self, trace_id: str) -> Trace:
+        """Get a trace by its ID.
+
+        :param trace_id: The ID of the trace to get.
+        :returns: A Trace.
+        """
+        ...
 
     @webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET")
-    async def get_span(self, trace_id: str, span_id: str) -> Span: ...
+    async def get_span(self, trace_id: str, span_id: str) -> Span:
+        """Get a span by its ID.
+
+        :param trace_id: The ID of the trace to get the span from.
+        :param span_id: The ID of the span to get.
+        :returns: A Span.
+        """
+        ...
 
     @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="POST")
     async def get_span_tree(
         self,
         span_id: str,
-        attributes_to_return: Optional[List[str]] = None,
-        max_depth: Optional[int] = None,
-    ) -> QuerySpanTreeResponse: ...
+        attributes_to_return: list[str] | None = None,
+        max_depth: int | None = None,
+    ) -> QuerySpanTreeResponse:
+        """Get a span tree by its ID.
+
+        :param span_id: The ID of the span to get the tree from.
+        :param attributes_to_return: The attributes to return in the tree.
+        :param max_depth: The maximum depth of the tree.
+        :returns: A QuerySpanTreeResponse.
+        """
+        ...
 
     @webmethod(route="/telemetry/spans", method="POST")
     async def query_spans(
         self,
-        attribute_filters: List[QueryCondition],
-        attributes_to_return: List[str],
-        max_depth: Optional[int] = None,
-    ) -> QuerySpansResponse: ...
+        attribute_filters: list[QueryCondition],
+        attributes_to_return: list[str],
+        max_depth: int | None = None,
+    ) -> QuerySpansResponse:
+        """Query spans.
+
+        :param attribute_filters: The attribute filters to apply to the spans.
+        :param attributes_to_return: The attributes to return in the spans.
+        :param max_depth: The maximum depth of the tree.
+        :returns: A QuerySpansResponse.
+        """
+        ...
 
     @webmethod(route="/telemetry/spans/export", method="POST")
     async def save_spans_to_dataset(
         self,
-        attribute_filters: List[QueryCondition],
-        attributes_to_save: List[str],
+        attribute_filters: list[QueryCondition],
+        attributes_to_save: list[str],
         dataset_id: str,
-        max_depth: Optional[int] = None,
-    ) -> None: ...
+        max_depth: int | None = None,
+    ) -> None:
+        """Save spans to a dataset.
+
+        :param attribute_filters: The attribute filters to apply to the spans.
+        :param attributes_to_save: The attributes to save to the dataset.
+        :param dataset_id: The ID of the dataset to save the spans to.
+        :param max_depth: The maximum depth of the tree.
+        """
+        ...
+
+    @webmethod(route="/telemetry/metrics/{metric_name}", method="POST")
+    async def query_metrics(
+        self,
+        metric_name: str,
+        start_time: int,
+        end_time: int | None = None,
+        granularity: str | None = "1d",
+        query_type: MetricQueryType = MetricQueryType.RANGE,
+        label_matchers: list[MetricLabelMatcher] | None = None,
+    ) -> QueryMetricsResponse:
+        """Query metrics.
+
+        :param metric_name: The name of the metric to query.
+        :param start_time: The start time of the metric to query.
+        :param end_time: The end time of the metric to query.
+        :param granularity: The granularity of the metric to query.
+        :param query_type: The type of query to perform.
+        :param label_matchers: The label matchers to apply to the metric.
+        :returns: A QueryMetricsResponse.
+        """
+        ...
diff --git a/llama_stack/apis/tools/rag_tool.py b/llama_stack/apis/tools/rag_tool.py
index 73b36e050..1e3542f74 100644
--- a/llama_stack/apis/tools/rag_tool.py
+++ b/llama_stack/apis/tools/rag_tool.py
@@ -5,10 +5,10 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
-from pydantic import BaseModel, Field
-from typing_extensions import Annotated, Protocol, runtime_checkable
+from pydantic import BaseModel, Field, field_validator
+from typing_extensions import Protocol, runtime_checkable
 
 from llama_stack.apis.common.content_types import URL, InterleavedContent
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@@ -29,13 +29,13 @@ class RAGDocument(BaseModel):
     document_id: str
     content: InterleavedContent | URL
     mime_type: str | None = None
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: dict[str, Any] = Field(default_factory=dict)
 
 
 @json_schema_type
 class RAGQueryResult(BaseModel):
-    content: Optional[InterleavedContent] = None
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    content: InterleavedContent | None = None
+    metadata: dict[str, Any] = Field(default_factory=dict)
 
 
 @json_schema_type
@@ -59,10 +59,7 @@ class LLMRAGQueryGeneratorConfig(BaseModel):
 
 
 RAGQueryGeneratorConfig = Annotated[
-    Union[
-        DefaultRAGQueryGeneratorConfig,
-        LLMRAGQueryGeneratorConfig,
-    ],
+    DefaultRAGQueryGeneratorConfig | LLMRAGQueryGeneratorConfig,
     Field(discriminator="type"),
 ]
 register_schema(RAGQueryGeneratorConfig, name="RAGQueryGeneratorConfig")
@@ -70,11 +67,35 @@ register_schema(RAGQueryGeneratorConfig, name="RAGQueryGeneratorConfig")
 
 @json_schema_type
 class RAGQueryConfig(BaseModel):
+    """
+    Configuration for the RAG query generation.
+
+    :param query_generator_config: Configuration for the query generator.
+    :param max_tokens_in_context: Maximum number of tokens in the context.
+    :param max_chunks: Maximum number of chunks to retrieve.
+    :param chunk_template: Template for formatting each retrieved chunk in the context.
+        Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
+        Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
+    :param mode: Search mode for retrieval—either "vector" or "keyword". Defaults to None, which is expected to behave as "vector".
+    """
+
     # This config defines how a query is generated using the messages
     # for memory bank retrieval.
     query_generator_config: RAGQueryGeneratorConfig = Field(default=DefaultRAGQueryGeneratorConfig())
     max_tokens_in_context: int = 4096
     max_chunks: int = 5
+    chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
+    mode: str | None = None
+
+    @field_validator("chunk_template")
+    def validate_chunk_template(cls, v: str) -> str:  # rejects empty templates and ones missing a required placeholder
+        if len(v) == 0:  # must run first: an empty template also fails the placeholder checks below
+            raise ValueError("chunk_template must not be empty")
+        if "{chunk.content}" not in v:
+            raise ValueError("chunk_template must contain {chunk.content}")
+        if "{index}" not in v:
+            raise ValueError("chunk_template must contain {index}")
+        return v
 
 
 @runtime_checkable
@@ -83,7 +104,7 @@ class RAGToolRuntime(Protocol):
     @webmethod(route="/tool-runtime/rag-tool/insert", method="POST")
     async def insert(
         self,
-        documents: List[RAGDocument],
+        documents: list[RAGDocument],
         vector_db_id: str,
         chunk_size_in_tokens: int = 512,
     ) -> None:
@@ -94,8 +115,8 @@ class RAGToolRuntime(Protocol):
     async def query(
         self,
         content: InterleavedContent,
-        vector_db_ids: List[str],
-        query_config: Optional[RAGQueryConfig] = None,
+        vector_db_ids: list[str],
+        query_config: RAGQueryConfig | None = None,
     ) -> RAGQueryResult:
         """Query the RAG system for context; typically invoked by the agent"""
         ...
diff --git a/llama_stack/apis/tools/tools.py b/llama_stack/apis/tools/tools.py
index 4ca72f71d..0c8d47edf 100644
--- a/llama_stack/apis/tools/tools.py
+++ b/llama_stack/apis/tools/tools.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional
+from typing import Any, Literal
 
 from pydantic import BaseModel, Field
 from typing_extensions import Protocol, runtime_checkable
@@ -24,68 +24,60 @@ class ToolParameter(BaseModel):
     parameter_type: str
     description: str
     required: bool = Field(default=True)
-    default: Optional[Any] = None
-
-
-@json_schema_type
-class ToolHost(Enum):
-    distribution = "distribution"
-    client = "client"
-    model_context_protocol = "model_context_protocol"
+    default: Any | None = None
 
 
 @json_schema_type
 class Tool(Resource):
-    type: Literal[ResourceType.tool.value] = ResourceType.tool.value
+    type: Literal[ResourceType.tool] = ResourceType.tool
     toolgroup_id: str
-    tool_host: ToolHost
     description: str
-    parameters: List[ToolParameter]
-    metadata: Optional[Dict[str, Any]] = None
+    parameters: list[ToolParameter]
+    metadata: dict[str, Any] | None = None
 
 
 @json_schema_type
 class ToolDef(BaseModel):
     name: str
-    description: Optional[str] = None
-    parameters: Optional[List[ToolParameter]] = None
-    metadata: Optional[Dict[str, Any]] = None
+    description: str | None = None
+    parameters: list[ToolParameter] | None = None
+    metadata: dict[str, Any] | None = None
 
 
 @json_schema_type
 class ToolGroupInput(BaseModel):
     toolgroup_id: str
     provider_id: str
-    args: Optional[Dict[str, Any]] = None
-    mcp_endpoint: Optional[URL] = None
+    args: dict[str, Any] | None = None
+    mcp_endpoint: URL | None = None
 
 
 @json_schema_type
 class ToolGroup(Resource):
-    type: Literal[ResourceType.tool_group.value] = ResourceType.tool_group.value
-    mcp_endpoint: Optional[URL] = None
-    args: Optional[Dict[str, Any]] = None
+    type: Literal[ResourceType.tool_group] = ResourceType.tool_group
+    mcp_endpoint: URL | None = None
+    args: dict[str, Any] | None = None
 
 
 @json_schema_type
 class ToolInvocationResult(BaseModel):
-    content: Optional[InterleavedContent] = None
-    error_message: Optional[str] = None
-    error_code: Optional[int] = None
-    metadata: Optional[Dict[str, Any]] = None
+    content: InterleavedContent | None = None
+    error_message: str | None = None
+    error_code: int | None = None
+    metadata: dict[str, Any] | None = None
 
 
 class ToolStore(Protocol):
-    def get_tool(self, tool_name: str) -> Tool: ...
-    def get_tool_group(self, toolgroup_id: str) -> ToolGroup: ...
+    async def get_tool(self, tool_name: str) -> Tool: ...
+    async def get_tool_group(self, toolgroup_id: str) -> ToolGroup: ...
 
 
 class ListToolGroupsResponse(BaseModel):
-    data: List[ToolGroup]
+    data: list[ToolGroup]
 
 
 class ListToolsResponse(BaseModel):
-    data: List[Tool]
+    data: list[Tool]
 
 
 class ListToolDefsResponse(BaseModel):
@@ -100,40 +92,68 @@ class ToolGroups(Protocol):
         self,
         toolgroup_id: str,
         provider_id: str,
-        mcp_endpoint: Optional[URL] = None,
-        args: Optional[Dict[str, Any]] = None,
+        mcp_endpoint: URL | None = None,
+        args: dict[str, Any] | None = None,
     ) -> None:
-        """Register a tool group"""
+        """Register a tool group.
+
+        :param toolgroup_id: The ID of the tool group to register.
+        :param provider_id: The ID of the provider to use for the tool group.
+        :param mcp_endpoint: The MCP endpoint to use for the tool group.
+        :param args: A dictionary of arguments to pass to the tool group.
+        """
         ...
 
     @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET")
     async def get_tool_group(
         self,
         toolgroup_id: str,
-    ) -> ToolGroup: ...
+    ) -> ToolGroup:
+        """Get a tool group by its ID.
+
+        :param toolgroup_id: The ID of the tool group to get.
+        :returns: A ToolGroup.
+        """
+        ...
 
     @webmethod(route="/toolgroups", method="GET")
     async def list_tool_groups(self) -> ListToolGroupsResponse:
-        """List tool groups with optional provider"""
+        """List all tool groups.
+
+        :returns: A ListToolGroupsResponse.
+        """
         ...
 
     @webmethod(route="/tools", method="GET")
-    async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsResponse:
-        """List tools with optional tool group"""
+    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolsResponse:
+        """List tools with optional tool group.
+
+        :param toolgroup_id: The ID of the tool group to list tools for.
+        :returns: A ListToolsResponse.
+        """
         ...
 
     @webmethod(route="/tools/{tool_name:path}", method="GET")
     async def get_tool(
         self,
         tool_name: str,
-    ) -> Tool: ...
+    ) -> Tool:
+        """Get a tool by its name.
+
+        :param tool_name: The name of the tool to get.
+        :returns: A Tool.
+        """
+        ...
 
     @webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE")
     async def unregister_toolgroup(
         self,
         toolgroup_id: str,
     ) -> None:
-        """Unregister a tool group"""
+        """Unregister a tool group.
+
+        :param toolgroup_id: The ID of the tool group to unregister.
+        """
         ...
 
 
@@ -151,10 +171,22 @@ class ToolRuntime(Protocol):
     # TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
     @webmethod(route="/tool-runtime/list-tools", method="GET")
     async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> ListToolDefsResponse: ...
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
+    ) -> ListToolDefsResponse:
+        """List all tools in the runtime.
+
+        :param tool_group_id: The ID of the tool group to list tools for.
+        :param mcp_endpoint: The MCP endpoint to use for the tool group.
+        :returns: A ListToolDefsResponse.
+        """
+        ...
 
     @webmethod(route="/tool-runtime/invoke", method="POST")
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
-        """Run a tool with the given arguments"""
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
+        """Run a tool with the given arguments.
+
+        :param tool_name: The name of the tool to invoke.
+        :param kwargs: A dictionary of arguments to pass to the tool.
+        :returns: A ToolInvocationResult.
+        """
         ...
diff --git a/llama_stack/apis/vector_dbs/vector_dbs.py b/llama_stack/apis/vector_dbs/vector_dbs.py
index fe6c33919..405852476 100644
--- a/llama_stack/apis/vector_dbs/vector_dbs.py
+++ b/llama_stack/apis/vector_dbs/vector_dbs.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List, Literal, Optional, Protocol, runtime_checkable
+from typing import Literal, Protocol, runtime_checkable
 
 from pydantic import BaseModel
 
@@ -15,7 +15,7 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 
 @json_schema_type
 class VectorDB(Resource):
-    type: Literal[ResourceType.vector_db.value] = ResourceType.vector_db.value
+    type: Literal[ResourceType.vector_db] = ResourceType.vector_db
 
     embedding_model: str
     embedding_dimension: int
@@ -25,7 +25,7 @@ class VectorDB(Resource):
         return self.identifier
 
     @property
-    def provider_vector_db_id(self) -> str:
+    def provider_vector_db_id(self) -> str | None:
         return self.provider_resource_id
 
 
@@ -33,34 +33,60 @@ class VectorDBInput(BaseModel):
     vector_db_id: str
     embedding_model: str
     embedding_dimension: int
-    provider_vector_db_id: Optional[str] = None
+    provider_vector_db_id: str | None = None
 
 
 class ListVectorDBsResponse(BaseModel):
-    data: List[VectorDB]
+    data: list[VectorDB]
 
 
 @runtime_checkable
 @trace_protocol
 class VectorDBs(Protocol):
     @webmethod(route="/vector-dbs", method="GET")
-    async def list_vector_dbs(self) -> ListVectorDBsResponse: ...
+    async def list_vector_dbs(self) -> ListVectorDBsResponse:
+        """List all vector databases.
+
+        :returns: A ListVectorDBsResponse.
+        """
+        ...
 
     @webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET")
     async def get_vector_db(
         self,
         vector_db_id: str,
-    ) -> VectorDB: ...
+    ) -> VectorDB:
+        """Get a vector database by its identifier.
+
+        :param vector_db_id: The identifier of the vector database to get.
+        :returns: A VectorDB.
+        """
+        ...
 
     @webmethod(route="/vector-dbs", method="POST")
     async def register_vector_db(
         self,
         vector_db_id: str,
         embedding_model: str,
-        embedding_dimension: Optional[int] = 384,
-        provider_id: Optional[str] = None,
-        provider_vector_db_id: Optional[str] = None,
-    ) -> VectorDB: ...
+        embedding_dimension: int | None = 384,
+        provider_id: str | None = None,
+        provider_vector_db_id: str | None = None,
+    ) -> VectorDB:
+        """Register a vector database.
+
+        :param vector_db_id: The identifier of the vector database to register.
+        :param embedding_model: The embedding model to use.
+        :param embedding_dimension: The dimension of the embedding model.
+        :param provider_id: The identifier of the provider.
+        :param provider_vector_db_id: The identifier of the vector database in the provider.
+        :returns: A VectorDB.
+        """
+        ...
 
     @webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE")
-    async def unregister_vector_db(self, vector_db_id: str) -> None: ...
+    async def unregister_vector_db(self, vector_db_id: str) -> None:
+        """Unregister a vector database.
+
+        :param vector_db_id: The identifier of the vector database to unregister.
+        """
+        ...
diff --git a/llama_stack/apis/vector_io/vector_io.py b/llama_stack/apis/vector_io/vector_io.py
index ab0a4a20a..44cc8f904 100644
--- a/llama_stack/apis/vector_io/vector_io.py
+++ b/llama_stack/apis/vector_io/vector_io.py
@@ -8,7 +8,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
+from typing import Any, Protocol, runtime_checkable
 
 from pydantic import BaseModel, Field
 
@@ -19,18 +19,26 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 
 
 class Chunk(BaseModel):
+    """
+    A chunk of content that can be inserted into a vector database.
+    :param content: The content of the chunk, which can be interleaved text, images, or other types.
+    :param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
+    :param metadata: Metadata associated with the chunk, such as document ID, source, or other relevant information.
+    """
+
     content: InterleavedContent
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    embedding: list[float] | None = None
 
 
 @json_schema_type
 class QueryChunksResponse(BaseModel):
-    chunks: List[Chunk]
-    scores: List[float]
+    chunks: list[Chunk]
+    scores: list[float]
 
 
 class VectorDBStore(Protocol):
-    def get_vector_db(self, vector_db_id: str) -> Optional[VectorDB]: ...
+    def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...
 
 
 @runtime_checkable
@@ -44,14 +52,32 @@ class VectorIO(Protocol):
     async def insert_chunks(
         self,
         vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
-    ) -> None: ...
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
+    ) -> None:
+        """Insert chunks into a vector database.
+
+        :param vector_db_id: The identifier of the vector database to insert the chunks into.
+        :param chunks: The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types.
+            `metadata`: `dict[str, Any]` and `embedding`: `list[float]` are optional.
+            If `metadata` is provided, it configures how Llama Stack formats the chunk during generation.
+            If `embedding` is not provided, it will be computed later.
+        :param ttl_seconds: The time to live of the chunks.
+        """
+        ...
 
     @webmethod(route="/vector-io/query", method="POST")
     async def query_chunks(
         self,
         vector_db_id: str,
         query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
-    ) -> QueryChunksResponse: ...
+        params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """Query chunks from a vector database.
+
+        :param vector_db_id: The identifier of the vector database to query.
+        :param query: The query to search for.
+        :param params: The parameters of the query.
+        :returns: A QueryChunksResponse.
+        """
+        ...
diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py
index 9694bf22d..b96842119 100644
--- a/llama_stack/cli/download.py
+++ b/llama_stack/cli/download.py
@@ -9,11 +9,11 @@ import asyncio
 import json
 import os
 import shutil
+import sys
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import partial
 from pathlib import Path
-from typing import Dict, List, Optional
 
 import httpx
 from pydantic import BaseModel, ConfigDict
@@ -102,7 +102,7 @@ class DownloadTask:
     output_file: str
     total_size: int = 0
     downloaded_size: int = 0
-    task_id: Optional[int] = None
+    task_id: int | None = None
     retries: int = 0
     max_retries: int = 3
 
@@ -262,7 +262,7 @@ class ParallelDownloader:
             self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
             raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e
 
-    def has_disk_space(self, tasks: List[DownloadTask]) -> bool:
+    def has_disk_space(self, tasks: list[DownloadTask]) -> bool:
         try:
             total_remaining_size = sum(task.total_size - task.downloaded_size for task in tasks)
             dir_path = os.path.dirname(os.path.abspath(tasks[0].output_file))
@@ -282,7 +282,7 @@ class ParallelDownloader:
         except Exception as e:
             raise DownloadError(f"Failed to check disk space: {str(e)}") from e
 
-    async def download_all(self, tasks: List[DownloadTask]) -> None:
+    async def download_all(self, tasks: list[DownloadTask]) -> None:
         if not tasks:
             raise ValueError("No download tasks provided")
 
@@ -378,33 +378,34 @@ def _meta_download(
     downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
     asyncio.run(downloader.download_all(tasks))
 
-    cprint(f"\nSuccessfully downloaded model to {output_dir}", "green")
+    cprint(f"\nSuccessfully downloaded model to {output_dir}", color="green", file=sys.stderr)
     cprint(
         f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}",
-        "white",
+        file=sys.stderr,
     )
     cprint(
         f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}",
-        "yellow",
+        color="yellow",
+        file=sys.stderr,
     )
 
 
 class ModelEntry(BaseModel):
     model_id: str
-    files: Dict[str, str]
+    files: dict[str, str]
 
     model_config = ConfigDict(protected_namespaces=())
 
 
 class Manifest(BaseModel):
-    models: List[ModelEntry]
+    models: list[ModelEntry]
     expires_on: datetime
 
 
 def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
     from llama_stack.distribution.utils.model_utils import model_local_dir
 
-    with open(manifest_file, "r") as f:
+    with open(manifest_file) as f:
         d = json.load(f)
         manifest = Manifest(**d)
 
@@ -460,15 +461,17 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
         from llama_stack.models.llama.sku_list import llama_meta_net_info, resolve_model
 
         from .model.safety_models import (
-            prompt_guard_download_info,
-            prompt_guard_model_sku,
+            prompt_guard_download_info_map,
+            prompt_guard_model_sku_map,
         )
 
-        prompt_guard = prompt_guard_model_sku()
+        prompt_guard_model_sku_map = prompt_guard_model_sku_map()
+        prompt_guard_download_info_map = prompt_guard_download_info_map()
+
         for model_id in model_ids:
-            if model_id == prompt_guard.model_id:
-                model = prompt_guard
-                info = prompt_guard_download_info()
+            if model_id in prompt_guard_model_sku_map.keys():
+                model = prompt_guard_model_sku_map[model_id]
+                info = prompt_guard_download_info_map[model_id]
             else:
                 model = resolve_model(model_id)
                 if model is None:
diff --git a/llama_stack/cli/llama.py b/llama_stack/cli/llama.py
index 8ff580029..433b311e7 100644
--- a/llama_stack/cli/llama.py
+++ b/llama_stack/cli/llama.py
@@ -38,7 +38,10 @@ class LlamaCLIParser:
         print_subcommand_description(self.parser, subparsers)
 
     def parse_args(self) -> argparse.Namespace:
-        return self.parser.parse_args()
+        args = self.parser.parse_args()
+        if not isinstance(args, argparse.Namespace):
+            raise TypeError(f"Expected argparse.Namespace, got {type(args)}")
+        return args
 
     def run(self, args: argparse.Namespace) -> None:
         args.func(args)
diff --git a/llama_stack/cli/model/describe.py b/llama_stack/cli/model/describe.py
index 62dde36e8..26b0da686 100644
--- a/llama_stack/cli/model/describe.py
+++ b/llama_stack/cli/model/describe.py
@@ -36,11 +36,11 @@ class ModelDescribe(Subcommand):
         )
 
     def _run_model_describe_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_sku
+        from .safety_models import prompt_guard_model_sku_map
 
-        prompt_guard = prompt_guard_model_sku()
-        if args.model_id == prompt_guard.model_id:
-            model = prompt_guard
+        prompt_guard_model_map = prompt_guard_model_sku_map()
+        if args.model_id in prompt_guard_model_map.keys():
+            model = prompt_guard_model_map[args.model_id]
         else:
             model = resolve_model(args.model_id)
 
diff --git a/llama_stack/cli/model/list.py b/llama_stack/cli/model/list.py
index b9499f06d..cf84dd526 100644
--- a/llama_stack/cli/model/list.py
+++ b/llama_stack/cli/model/list.py
@@ -84,7 +84,7 @@ class ModelList(Subcommand):
         )
 
     def _run_model_list_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_sku
+        from .safety_models import prompt_guard_model_skus
 
         if args.downloaded:
             return _run_model_list_downloaded_cmd()
@@ -96,7 +96,7 @@ class ModelList(Subcommand):
         ]
 
         rows = []
-        for model in all_registered_models() + [prompt_guard_model_sku()]:
+        for model in all_registered_models() + prompt_guard_model_skus():
             if not args.show_all and not model.is_featured:
                 continue
 
diff --git a/llama_stack/cli/model/remove.py b/llama_stack/cli/model/remove.py
index ee8d6299d..98710d82b 100644
--- a/llama_stack/cli/model/remove.py
+++ b/llama_stack/cli/model/remove.py
@@ -42,11 +42,12 @@ class ModelRemove(Subcommand):
         )
 
     def _run_model_remove_cmd(self, args: argparse.Namespace) -> None:
-        from .safety_models import prompt_guard_model_sku
+        from .safety_models import prompt_guard_model_sku_map
 
-        prompt_guard = prompt_guard_model_sku()
-        if args.model == prompt_guard.model_id:
-            model = prompt_guard
+        prompt_guard_model_map = prompt_guard_model_sku_map()
+
+        if args.model in prompt_guard_model_map.keys():
+            model = prompt_guard_model_map[args.model]
         else:
             model = resolve_model(args.model)
 
diff --git a/llama_stack/cli/model/safety_models.py b/llama_stack/cli/model/safety_models.py
index 131d055aa..e31767f13 100644
--- a/llama_stack/cli/model/safety_models.py
+++ b/llama_stack/cli/model/safety_models.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel, ConfigDict, Field
 
@@ -15,14 +15,14 @@ from llama_stack.models.llama.sku_types import CheckpointQuantizationFormat
 class PromptGuardModel(BaseModel):
     """Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed."""
 
-    model_id: str = "Prompt-Guard-86M"
+    model_id: str
+    huggingface_repo: str
     description: str = "Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
     is_featured: bool = False
-    huggingface_repo: str = "meta-llama/Prompt-Guard-86M"
-    max_seq_length: int = 2048
+    max_seq_length: int = 512
     is_instruct_model: bool = False
     quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
-    arch_args: Dict[str, Any] = Field(default_factory=dict)
+    arch_args: dict[str, Any] = Field(default_factory=dict)
 
     def descriptor(self) -> str:
         return self.model_id
@@ -30,18 +30,42 @@ class PromptGuardModel(BaseModel):
     model_config = ConfigDict(protected_namespaces=())
 
 
-def prompt_guard_model_sku():
-    return PromptGuardModel()
+def prompt_guard_model_skus():
+    """Return the list of PromptGuardModel entries, one per Prompt Guard model id."""
+    return [
+        PromptGuardModel(model_id="Prompt-Guard-86M", huggingface_repo="meta-llama/Prompt-Guard-86M"),
+        PromptGuardModel(
+            model_id="Llama-Prompt-Guard-2-86M",
+            huggingface_repo="meta-llama/Llama-Prompt-Guard-2-86M",
+        ),
+        PromptGuardModel(
+            model_id="Llama-Prompt-Guard-2-22M",
+            huggingface_repo="meta-llama/Llama-Prompt-Guard-2-22M",
+        ),
+    ]
 
 
-def prompt_guard_download_info():
-    return LlamaDownloadInfo(
-        folder="Prompt-Guard",
-        files=[
-            "model.safetensors",
-            "special_tokens_map.json",
-            "tokenizer.json",
-            "tokenizer_config.json",
-        ],
-        pth_size=1,
-    )
+def prompt_guard_model_sku_map() -> dict[str, Any]:
+    """Return the entries of prompt_guard_model_skus() keyed by their model_id."""
+    return {model.model_id: model for model in prompt_guard_model_skus()}
+
+
+def prompt_guard_download_info_map() -> dict[str, LlamaDownloadInfo]:
+    """Return a LlamaDownloadInfo for each Prompt Guard model, keyed by model_id.
+
+    "Prompt-Guard-86M" uses the folder "Prompt-Guard"; every other model uses its
+    model_id as the folder name. All entries list the same four files.
+    """
+    return {
+        model.model_id: LlamaDownloadInfo(
+            folder="Prompt-Guard" if model.model_id == "Prompt-Guard-86M" else model.model_id,
+            files=[
+                "model.safetensors",
+                "special_tokens_map.json",
+                "tokenizer.json",
+                "tokenizer_config.json",
+            ],
+            pth_size=1,
+        )
+        for model in prompt_guard_model_skus()
+    }
diff --git a/llama_stack/cli/stack/_build.py b/llama_stack/cli/stack/_build.py
index 2787a93d5..f6f72946a 100644
--- a/llama_stack/cli/stack/_build.py
+++ b/llama_stack/cli/stack/_build.py
@@ -12,14 +12,14 @@ import shutil
 import sys
 import textwrap
 from functools import lru_cache
+from importlib.abc import Traversable
 from pathlib import Path
-from typing import Dict, Optional
 
 import yaml
 from prompt_toolkit import prompt
 from prompt_toolkit.completion import WordCompleter
 from prompt_toolkit.validation import Validator
-from termcolor import cprint
+from termcolor import colored, cprint
 
 from llama_stack.cli.stack.utils import ImageType
 from llama_stack.cli.table import print_table
@@ -37,7 +37,8 @@ from llama_stack.distribution.datatypes import (
 )
 from llama_stack.distribution.distribution import get_provider_registry
 from llama_stack.distribution.resolver import InvalidProviderError
-from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
+from llama_stack.distribution.stack import replace_env_vars
+from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.distribution.utils.exec import formulate_run_args, run_command
 from llama_stack.distribution.utils.image_types import LlamaStackImageType
@@ -46,14 +47,14 @@ from llama_stack.providers.datatypes import Api
 TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates"
 
 
-@lru_cache()
-def available_templates_specs() -> Dict[str, BuildConfig]:
+@lru_cache
+def available_templates_specs() -> dict[str, BuildConfig]:
     import yaml
 
     template_specs = {}
     for p in TEMPLATES_PATH.rglob("*build.yaml"):
         template_name = p.parent.name
-        with open(p, "r") as f:
+        with open(p) as f:
             build_config = BuildConfig(**yaml.safe_load(f))
             template_specs[template_name] = build_config
     return template_specs
@@ -78,6 +79,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
             cprint(
                 f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates",
                 color="red",
+                file=sys.stderr,
             )
             sys.exit(1)
         build_config = available_templates[args.template]
@@ -87,6 +89,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
             cprint(
                 f"Please specify a image-type ({' | '.join(e.value for e in ImageType)}) for {args.template}",
                 color="red",
+                file=sys.stderr,
             )
             sys.exit(1)
     elif args.providers:
@@ -96,6 +99,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                 cprint(
                     "Could not parse `--providers`. Please ensure the list is in the format api1=provider1,api2=provider2",
                     color="red",
+                    file=sys.stderr,
                 )
                 sys.exit(1)
             api, provider = api_provider.split("=")
@@ -104,6 +108,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                 cprint(
                     f"{api} is not a valid API.",
                     color="red",
+                    file=sys.stderr,
                 )
                 sys.exit(1)
             if provider in providers_for_api:
@@ -112,6 +117,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                 cprint(
                     f"{provider} is not a valid provider for the {api} API.",
                     color="red",
+                    file=sys.stderr,
                 )
                 sys.exit(1)
         distribution_spec = DistributionSpec(
@@ -122,6 +128,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
             cprint(
                 f"Please specify a image-type (container | conda | venv) for {args.template}",
                 color="red",
+                file=sys.stderr,
             )
             sys.exit(1)
 
@@ -150,12 +157,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                 cprint(
                     f"No current conda environment detected or specified, will create a new conda environment with the name `llamastack-{name}`",
                     color="yellow",
+                    file=sys.stderr,
                 )
                 image_name = f"llamastack-{name}"
             else:
                 cprint(
                     f"Using conda environment {image_name}",
                     color="green",
+                    file=sys.stderr,
                 )
         else:
             image_name = f"llamastack-{name}"
@@ -168,9 +177,10 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
             """,
             ),
             color="green",
+            file=sys.stderr,
         )
 
-        print("Tip: use  to see options for the providers.\n")
+        cprint("Tip: use  to see options for the providers.\n", color="green", file=sys.stderr)
 
         providers = dict()
         for api, providers_for_api in get_provider_registry().items():
@@ -178,7 +188,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
             if not available_providers:
                 continue
             api_provider = prompt(
-                "> Enter provider for API {}: ".format(api.value),
+                f"> Enter provider for API {api.value}: ",
                 completer=WordCompleter(available_providers),
                 complete_while_typing=True,
                 validator=Validator.from_callable(
@@ -201,13 +211,18 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
 
         build_config = BuildConfig(image_type=image_type, distribution_spec=distribution_spec)
     else:
-        with open(args.config, "r") as f:
+        with open(args.config) as f:
             try:
-                build_config = BuildConfig(**yaml.safe_load(f))
+                contents = yaml.safe_load(f)
+                contents = replace_env_vars(contents)
+                build_config = BuildConfig(**contents)
+                if args.image_type:
+                    build_config.image_type = args.image_type
             except Exception as e:
                 cprint(
                     f"Could not parse config file {args.config}: {e}",
                     color="red",
+                    file=sys.stderr,
                 )
                 sys.exit(1)
 
@@ -234,23 +249,27 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         cprint(
             f"Error building stack: {exc}",
             color="red",
+            file=sys.stderr,
         )
-        cprint("Stack trace:", color="red")
+        cprint("Stack trace:", color="red", file=sys.stderr)
         traceback.print_exc()
         sys.exit(1)
+
     if run_config is None:
         cprint(
             "Run config path is empty",
             color="red",
+            file=sys.stderr,
         )
         sys.exit(1)
 
     if args.run:
-        run_config = Path(run_config)
         config_dict = yaml.safe_load(run_config.read_text())
         config = parse_and_maybe_upgrade_config(config_dict)
+        if config.external_providers_dir and not config.external_providers_dir.exists():
+            config.external_providers_dir.mkdir(exist_ok=True)
         run_args = formulate_run_args(args.image_type, args.image_name, config, args.template)
-        run_args.extend([run_config, str(os.getenv("LLAMA_STACK_PORT", 8321))])
+        run_args.extend([str(os.getenv("LLAMA_STACK_PORT", 8321)), "--config", run_config])
         run_command(run_args)
 
 
@@ -258,7 +277,7 @@ def _generate_run_config(
     build_config: BuildConfig,
     build_dir: Path,
     image_name: str,
-) -> str:
+) -> Path:
     """
     Generate a run.yaml template file for user to edit from a build.yaml file
     """
@@ -268,7 +287,9 @@ def _generate_run_config(
         image_name=image_name,
         apis=apis,
         providers={},
-        external_providers_dir=build_config.external_providers_dir if build_config.external_providers_dir else None,
+        external_providers_dir=build_config.external_providers_dir
+        if build_config.external_providers_dir
+        else EXTERNAL_PROVIDERS_DIR,
     )
     # build providers dict
     provider_registry = get_provider_registry(build_config)
@@ -296,6 +317,7 @@ def _generate_run_config(
                 cprint(
                     f"Failed to import provider {provider_type} for API {api} - assuming it's external, skipping",
                     color="yellow",
+                    file=sys.stderr,
                 )
                 # Set config_type to None to avoid UnboundLocalError
                 config_type = None
@@ -323,19 +345,16 @@ def _generate_run_config(
     # For non-container builds, the run.yaml is generated at the very end of the build process so it
     # makes sense to display this message
     if build_config.image_type != LlamaStackImageType.CONTAINER.value:
-        cprint(
-            f"You can now run your stack with `llama stack run {run_config_file}`",
-            color="green",
-        )
+        cprint(f"You can now run your stack with `llama stack run {run_config_file}`", color="green", file=sys.stderr)
     return run_config_file
 
 
 def _run_stack_build_command_from_build_config(
     build_config: BuildConfig,
-    image_name: Optional[str] = None,
-    template_name: Optional[str] = None,
-    config_path: Optional[str] = None,
-) -> str:
+    image_name: str | None = None,
+    template_name: str | None = None,
+    config_path: str | None = None,
+) -> Path | Traversable:
     image_name = image_name or build_config.image_name
     if build_config.image_type == LlamaStackImageType.CONTAINER.value:
         if template_name:
@@ -364,7 +383,7 @@ def _run_stack_build_command_from_build_config(
     # Generate the run.yaml so it can be included in the container image with the proper entrypoint
     # Only do this if we're building a container image and we're not using a template
     if build_config.image_type == LlamaStackImageType.CONTAINER.value and not template_name and config_path:
-        cprint("Generating run.yaml file", color="green")
+        cprint("Generating run.yaml file", color="yellow", file=sys.stderr)
         run_config_file = _generate_run_config(build_config, build_dir, image_name)
 
     with open(build_file_path, "w") as f:
@@ -388,7 +407,14 @@ def _run_stack_build_command_from_build_config(
             run_config_file = build_dir / f"{template_name}-run.yaml"
             shutil.copy(path, run_config_file)
 
-        cprint("Build Successful!", color="green")
+        cprint("Build Successful!", color="green", file=sys.stderr)
+        cprint(f"You can find the newly-built template here: {template_path}", color="light_blue", file=sys.stderr)
+        cprint(
+            "You can run the new Llama Stack distro via: "
+            + colored(f"llama stack run {template_path} --image-type {build_config.image_type}", "light_blue"),
+            color="green",
+            file=sys.stderr,
+        )
         return template_path
     else:
         return _generate_run_config(build_config, build_dir, image_name)
diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py
index 93e7d9b22..2c402beeb 100644
--- a/llama_stack/cli/stack/build.py
+++ b/llama_stack/cli/stack/build.py
@@ -49,7 +49,7 @@ class StackBuild(Subcommand):
             type=str,
             help="Image Type to use for the build. If not specified, will use the image type from the template config.",
             choices=[e.value for e in ImageType],
-            default=ImageType.CONDA.value,
+            default=None,  # no default so we can detect if a user specified --image-type and override image_type in the config
         )
 
         self.parser.add_argument(
diff --git a/llama_stack/cli/stack/list_providers.py b/llama_stack/cli/stack/list_providers.py
index bfe11aa2c..deebd937b 100644
--- a/llama_stack/cli/stack/list_providers.py
+++ b/llama_stack/cli/stack/list_providers.py
@@ -46,7 +46,7 @@ class StackListProviders(Subcommand):
         else:
             providers = [(k.value, prov) for k, prov in all_providers.items()]
 
-        providers = [p for api, p in providers if api in self.providable_apis]
+        providers = [(api, p) for api, p in providers if api in self.providable_apis]
 
         # eventually, this should query a registry at llama.meta.com/llamastack/distributions
         headers = [
@@ -57,7 +57,7 @@ class StackListProviders(Subcommand):
 
         rows = []
 
-        specs = [spec for p in providers for spec in p.values()]
+        specs = [spec for api, p in providers for spec in p.values()]
         for spec in specs:
             if spec.is_sample:
                 continue
@@ -65,7 +65,7 @@ class StackListProviders(Subcommand):
                 [
                     spec.api.value,
                     spec.provider_type,
-                    ",".join(spec.pip_packages),
+                    ",".join(spec.pip_packages) if hasattr(spec, "pip_packages") else "",
                 ]
             )
         print_table(
diff --git a/llama_stack/cli/stack/list_stacks.py b/llama_stack/cli/stack/list_stacks.py
new file mode 100644
index 000000000..2ea0fdeea
--- /dev/null
+++ b/llama_stack/cli/stack/list_stacks.py
@@ -0,0 +1,66 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import argparse
+from pathlib import Path
+
+from llama_stack.cli.subcommand import Subcommand
+from llama_stack.cli.table import print_table
+
+
+class StackListBuilds(Subcommand):
+    """`llama stack list`: show the stacks built under ~/.llama/distributions."""
+
+    def __init__(self, subparsers: argparse._SubParsersAction):
+        """Register the `list` subcommand and route it to `_list_stack_command`."""
+        super().__init__()
+        list_parser = subparsers.add_parser(
+            "list",
+            prog="llama stack list",
+            description="list the build stacks",
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        )
+        self.parser = list_parser
+        self._add_arguments()
+        self.parser.set_defaults(func=self._list_stack_command)
+
+    def _get_distribution_dirs(self) -> dict[str, Path]:
+        """Map each sub-directory name of ~/.llama/distributions to its path.
+
+        :returns: An empty dict when the distributions directory does not exist.
+        """
+        root = Path.home() / ".llama" / "distributions"
+        if not root.exists():
+            return {}
+        return {entry.name: entry for entry in root.iterdir() if entry.is_dir()}
+
+    def _list_stack_command(self, args: argparse.Namespace) -> None:
+        """Print a table with one row per stack.
+
+        The "Build Config" / "Run Config" columns report whether
+        `{name}-build.yaml` / `{name}-run.yaml` exist inside the stack directory.
+
+        :param args: Parsed CLI arguments; not read by this command.
+        """
+        stacks = self._get_distribution_dirs()
+        if not stacks:
+            print("No stacks found in ~/.llama/distributions")
+            return
+
+        def presence(candidate: Path) -> str:
+            return "Yes" if candidate.exists() else "No"
+
+        headers = ["Stack Name", "Path", "Build Config", "Run Config"]
+        rows = [
+            [
+                stack_name,
+                str(stack_dir),
+                presence(stack_dir / f"{stack_name}-build.yaml"),
+                presence(stack_dir / f"{stack_name}-run.yaml"),
+            ]
+            for stack_name, stack_dir in stacks.items()
+        ]
+        print_table(rows, headers, separate_rows=True)
diff --git a/llama_stack/cli/stack/remove.py b/llama_stack/cli/stack/remove.py
new file mode 100644
index 000000000..a1796941e
--- /dev/null
+++ b/llama_stack/cli/stack/remove.py
@@ -0,0 +1,143 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import argparse
+import shutil
+import sys
+from pathlib import Path
+
+from termcolor import cprint
+
+from llama_stack.cli.subcommand import Subcommand
+from llama_stack.cli.table import print_table
+
+
+class StackRemove(Subcommand):
+    """`llama stack rm`: delete built stacks from ~/.llama/distributions."""
+
+    def __init__(self, subparsers: argparse._SubParsersAction):
+        """Register the `rm` subcommand and route it to `_remove_stack_build_command`."""
+        super().__init__()
+        self.parser = subparsers.add_parser(
+            "rm",
+            prog="llama stack rm",
+            description="Remove the build stack",
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        )
+        self._add_arguments()
+        self.parser.set_defaults(func=self._remove_stack_build_command)
+
+    def _add_arguments(self) -> None:
+        """Declare the optional positional stack name and the `--all` / `-a` flag."""
+        self.parser.add_argument(
+            "name",
+            type=str,
+            nargs="?",
+            help="Name of the stack to delete",
+        )
+        self.parser.add_argument(
+            "--all",
+            "-a",
+            action="store_true",
+            help="Delete all stacks (use with caution)",
+        )
+
+    def _get_distribution_dirs(self) -> dict[str, Path]:
+        """Return a dictionary of distribution names and their paths.
+
+        Only directories directly under ~/.llama/distributions are included; the
+        result is empty when that directory does not exist.
+        """
+        distributions = {}
+        dist_dir = Path.home() / ".llama" / "distributions"
+
+        if dist_dir.exists():
+            for stack_dir in dist_dir.iterdir():
+                if stack_dir.is_dir():
+                    distributions[stack_dir.name] = stack_dir
+        return distributions
+
+    def _list_stacks(self) -> None:
+        """Display available stacks in a table.
+
+        Exits with status 1 (after an error message on stderr) when there are none.
+        """
+        distributions = self._get_distribution_dirs()
+        if not distributions:
+            cprint("No stacks found in ~/.llama/distributions", color="red", file=sys.stderr)
+            sys.exit(1)
+
+        headers = ["Stack Name", "Path"]
+        rows = [[name, str(path)] for name, path in distributions.items()]
+        print_table(rows, headers, separate_rows=True)
+
+    def _remove_stack_build_command(self, args: argparse.Namespace) -> None:
+        """Delete every stack (`--all`) or only the stack named on the command line.
+
+        Each path asks for interactive confirmation before anything is removed.
+        When `--all` is given, a positional name is ignored. Without `--all` and
+        without a name, the available stacks are listed and nothing is deleted.
+
+        :param args: Parsed CLI arguments; reads `args.all` and `args.name`.
+        :raises SystemExit: With status 1 when there are no stacks to delete, the
+            named stack does not exist, or a stack directory cannot be removed.
+        """
+        distributions = self._get_distribution_dirs()
+
+        if args.all:
+            if not distributions:
+                # Nothing to delete: report it up front instead of asking for confirmation.
+                cprint("No stacks found in ~/.llama/distributions", color="red", file=sys.stderr)
+                sys.exit(1)
+
+            confirm = input("Are you sure you want to delete ALL stacks? [yes-i-really-want/N] ").lower()
+            if confirm != "yes-i-really-want":
+                cprint("Deletion cancelled.", color="green", file=sys.stderr)
+                return
+
+            for name, path in distributions.items():
+                try:
+                    shutil.rmtree(path)
+                    cprint(f"Deleted stack: {name}", color="green", file=sys.stderr)
+                except Exception as e:
+                    cprint(
+                        f"Failed to delete stack {name}: {e}",
+                        color="red",
+                        file=sys.stderr,
+                    )
+                    sys.exit(1)
+
+            # Bulk deletion is complete. Falling through would list the now-empty
+            # directory, print "No stacks found" and exit with status 1 after a success.
+            return
+
+        if not args.name:
+            # No target given: show what could be removed and stop.
+            self._list_stacks()
+            return
+
+        if args.name not in distributions:
+            self._list_stacks()
+            cprint(
+                f"Stack not found: {args.name}",
+                color="red",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+
+        stack_path = distributions[args.name]
+
+        confirm = input(f"Are you sure you want to delete stack '{args.name}'? [y/N] ").lower()
+        if confirm != "y":
+            cprint("Deletion cancelled.", color="green", file=sys.stderr)
+            return
+
+        try:
+            shutil.rmtree(stack_path)
+            cprint(f"Successfully deleted stack: {args.name}", color="green", file=sys.stderr)
+        except Exception as e:
+            cprint(f"Failed to delete stack {args.name}: {e}", color="red", file=sys.stderr)
+            sys.exit(1)
diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
index d8234bb46..27745edac 100644
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@@ -6,6 +6,7 @@
 
 import argparse
 import os
+import subprocess
 from pathlib import Path
 
 from llama_stack.cli.stack.utils import ImageType
@@ -33,7 +34,8 @@ class StackRun(Subcommand):
         self.parser.add_argument(
             "config",
             type=str,
-            help="Path to config file to use for the run",
+            nargs="?",  # Make it optional
+            help="Path to config file to use for the run. Required for venv and conda environments.",
         )
         self.parser.add_argument(
             "--port",
@@ -47,34 +49,23 @@ class StackRun(Subcommand):
             default=os.environ.get("CONDA_DEFAULT_ENV"),
             help="Name of the image to run. Defaults to the current environment",
         )
-        self.parser.add_argument(
-            "--disable-ipv6",
-            action="store_true",
-            help="Disable IPv6 support",
-            default=False,
-        )
         self.parser.add_argument(
             "--env",
             action="append",
             help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
             metavar="KEY=VALUE",
         )
-        self.parser.add_argument(
-            "--tls-keyfile",
-            type=str,
-            help="Path to TLS key file for HTTPS",
-        )
-        self.parser.add_argument(
-            "--tls-certfile",
-            type=str,
-            help="Path to TLS certificate file for HTTPS",
-        )
         self.parser.add_argument(
             "--image-type",
             type=str,
             help="Image Type used during the build. This can be either conda or container or venv.",
             choices=[e.value for e in ImageType],
         )
+        self.parser.add_argument(
+            "--enable-ui",
+            action="store_true",
+            help="Start the UI server",
+        )
 
     # If neither image type nor image name is provided, but at the same time
     # the current environment has conda breadcrumbs, then assume what the user
@@ -98,44 +89,57 @@ class StackRun(Subcommand):
         from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
         from llama_stack.distribution.utils.exec import formulate_run_args, run_command
 
-        config_file = Path(args.config)
-        has_yaml_suffix = args.config.endswith(".yaml")
-        template_name = None
-
-        if not config_file.exists() and not has_yaml_suffix:
-            # check if this is a template
-            config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
-            if config_file.exists():
-                template_name = args.config
-
-        if not config_file.exists() and not has_yaml_suffix:
-            # check if it's a build config saved to ~/.llama dir
-            config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
-
-        if not config_file.exists():
-            self.parser.error(
-                f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
-            )
-
-        if not config_file.is_file():
-            self.parser.error(
-                f"Config file must be a valid file path, '{config_file}’ is not a file: type={type(config_file)}"
-            )
-
-        logger.info(f"Using run configuration: {config_file}")
-
-        try:
-            config_dict = yaml.safe_load(config_file.read_text())
-        except yaml.parser.ParserError as e:
-            self.parser.error(f"failed to load config file '{config_file}':\n {e}")
-
-        try:
-            config = parse_and_maybe_upgrade_config(config_dict)
-        except AttributeError as e:
-            self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
-
+        if args.enable_ui:
+            self._start_ui_development_server(args.port)
         image_type, image_name = self._get_image_type_and_name(args)
 
+        # Check if config is required based on image type
+        if (image_type in [ImageType.CONDA.value, ImageType.VENV.value]) and not args.config:
+            self.parser.error("Config file is required for venv and conda environments")
+
+        if args.config:
+            config_file = Path(args.config)
+            has_yaml_suffix = args.config.endswith(".yaml")
+            template_name = None
+
+            if not config_file.exists() and not has_yaml_suffix:
+                # check if this is a template
+                config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
+                if config_file.exists():
+                    template_name = args.config
+
+            if not config_file.exists() and not has_yaml_suffix:
+                # check if it's a build config saved to ~/.llama dir
+                config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
+
+            if not config_file.exists():
+                self.parser.error(
+                    f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
+                )
+
+            if not config_file.is_file():
+                self.parser.error(
+                    f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
+                )
+
+            logger.info(f"Using run configuration: {config_file}")
+
+            try:
+                config_dict = yaml.safe_load(config_file.read_text())
+            except yaml.parser.ParserError as e:
+                self.parser.error(f"failed to load config file '{config_file}':\n {e}")
+
+            try:
+                config = parse_and_maybe_upgrade_config(config_dict)
+                if config.external_providers_dir is not None:  # str(None) would create a dir named "None"
+                    os.makedirs(str(config.external_providers_dir), exist_ok=True)
+            except AttributeError as e:
+                self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
+        else:
+            config = None
+            config_file = None
+            template_name = None
+
         # If neither image type nor image name is provided, assume the server should be run directly
         # using the current environment packages.
         if not image_type and not image_name:
@@ -157,9 +161,10 @@ class StackRun(Subcommand):
         else:
             run_args = formulate_run_args(image_type, image_name, config, template_name)
 
-            run_args.extend([str(config_file), str(args.port)])
-            if args.disable_ipv6:
-                run_args.append("--disable-ipv6")
+            run_args.extend([str(args.port)])
+
+            if config_file:
+                run_args.extend(["--config", str(config_file)])
 
             if args.env:
                 for env_var in args.env:
@@ -172,6 +177,45 @@ class StackRun(Subcommand):
                         return
                     run_args.extend(["--env", f"{key}={value}"])
 
-            if args.tls_keyfile and args.tls_certfile:
-                run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
             run_command(run_args)
+
+    def _start_ui_development_server(self, stack_server_port: int):
+        """Best-effort launch of `npm run dev` in the UI directory; failures are logged rather than raised.
+
+        stdout/stderr of the UI process are appended to ~/.llama/ui/logs/{stdout,stderr}.log, and the
+        process is pointed at the stack server through NEXT_PUBLIC_LLAMA_STACK_BASE_URL.
+        """
+        logger.info("Attempting to start UI development server...")
+        ui_dir = REPO_ROOT / "llama_stack" / "ui"
+        logs_dir = Path("~/.llama/ui/logs").expanduser()
+        try:
+            # A missing npm binary raises FileNotFoundError (check=False only covers non-zero exits),
+            # so the availability probe has to run inside the try block.
+            npm_check = subprocess.run(["npm", "--version"], capture_output=True, text=True, check=False)
+            if npm_check.returncode != 0:
+                logger.warning(
+                    f"'npm' command not found or not executable. UI development server will not be started. Error: {npm_check.stderr}"
+                )
+                return
+
+            logs_dir.mkdir(parents=True, exist_ok=True)
+            ui_stdout_log_path = logs_dir / "stdout.log"
+            ui_stderr_log_path = logs_dir / "stderr.log"
+            # The child inherits its own copies of the descriptors; close ours once it has been spawned.
+            with open(ui_stdout_log_path, "a") as stdout_log_file, open(ui_stderr_log_path, "a") as stderr_log_file:
+                process = subprocess.Popen(
+                    ["npm", "run", "dev"],
+                    cwd=str(ui_dir),
+                    stdout=stdout_log_file,
+                    stderr=stderr_log_file,
+                    env={**os.environ, "NEXT_PUBLIC_LLAMA_STACK_BASE_URL": f"http://localhost:{stack_server_port}"},
+                )
+            logger.info(f"UI development server process started in {ui_dir} with PID {process.pid}.")
+            logger.info(f"Logs: stdout -> {ui_stdout_log_path}, stderr -> {ui_stderr_log_path}")
+            logger.info(f"UI will be available at http://localhost:{os.getenv('LLAMA_STACK_UI_PORT', 8322)}")
+        except FileNotFoundError:
+            logger.error(
+                "Failed to start UI development server: 'npm' command not found. Make sure npm is installed and in your PATH."
+            )
+        except Exception as e:
+            logger.error(f"Failed to start UI development server in {ui_dir}: {e}")
diff --git a/llama_stack/cli/stack/stack.py b/llama_stack/cli/stack/stack.py
index ccf1a5ffc..3aff78e23 100644
--- a/llama_stack/cli/stack/stack.py
+++ b/llama_stack/cli/stack/stack.py
@@ -7,12 +7,14 @@
 import argparse
 from importlib.metadata import version
 
+from llama_stack.cli.stack.list_stacks import StackListBuilds
 from llama_stack.cli.stack.utils import print_subcommand_description
 from llama_stack.cli.subcommand import Subcommand
 
 from .build import StackBuild
 from .list_apis import StackListApis
 from .list_providers import StackListProviders
+from .remove import StackRemove
 from .run import StackRun
 
 
@@ -41,5 +43,6 @@ class StackParser(Subcommand):
         StackListApis.create(subparsers)
         StackListProviders.create(subparsers)
         StackRun.create(subparsers)
-
+        StackRemove.create(subparsers)
+        StackListBuilds.create(subparsers)
         print_subcommand_description(self.parser, subparsers)
diff --git a/llama_stack/cli/table.py b/llama_stack/cli/table.py
index bf59e6103..86c3adff2 100644
--- a/llama_stack/cli/table.py
+++ b/llama_stack/cli/table.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Iterable
+from collections.abc import Iterable
 
 from rich.console import Console
 from rich.table import Table
diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py
index 1229e8601..3a1af3cbc 100644
--- a/llama_stack/cli/verify_download.py
+++ b/llama_stack/cli/verify_download.py
@@ -9,7 +9,6 @@ import hashlib
 from dataclasses import dataclass
 from functools import partial
 from pathlib import Path
-from typing import Dict, List, Optional
 
 from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn
@@ -21,7 +20,7 @@ from llama_stack.cli.subcommand import Subcommand
 class VerificationResult:
     filename: str
     expected_hash: str
-    actual_hash: Optional[str]
+    actual_hash: str | None
     exists: bool
     matches: bool
 
@@ -60,9 +59,9 @@ def calculate_md5(filepath: Path, chunk_size: int = 8192) -> str:
     return md5_hash.hexdigest()
 
 
-def load_checksums(checklist_path: Path) -> Dict[str, str]:
+def load_checksums(checklist_path: Path) -> dict[str, str]:
     checksums = {}
-    with open(checklist_path, "r") as f:
+    with open(checklist_path) as f:
         for line in f:
             if line.strip():
                 md5sum, filepath = line.strip().split("  ", 1)
@@ -72,7 +71,7 @@ def load_checksums(checklist_path: Path) -> Dict[str, str]:
     return checksums
 
 
-def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -> List[VerificationResult]:
+def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -> list[VerificationResult]:
     results = []
 
     with Progress(
diff --git a/llama_stack/distribution/access_control.py b/llama_stack/distribution/access_control.py
index 0651ab6eb..d560ec80f 100644
--- a/llama_stack/distribution/access_control.py
+++ b/llama_stack/distribution/access_control.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.distribution.datatypes import AccessAttributes
 from llama_stack.log import get_logger
@@ -14,8 +14,8 @@ logger = get_logger(__name__, category="core")
 
 def check_access(
     obj_identifier: str,
-    obj_attributes: Optional[AccessAttributes],
-    user_attributes: Optional[Dict[str, Any]] = None,
+    obj_attributes: AccessAttributes | None,
+    user_attributes: dict[str, Any] | None = None,
 ) -> bool:
     """Check if the current user has access to the given object, based on access attributes.
 
diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py
index 9664449f3..072f9c425 100644
--- a/llama_stack/distribution/build.py
+++ b/llama_stack/distribution/build.py
@@ -6,6 +6,7 @@
 
 import importlib.resources
 import logging
+import sys
 from pathlib import Path
 
 from pydantic import BaseModel
@@ -43,18 +44,29 @@ def get_provider_dependencies(
     # Extract providers based on config type
     if isinstance(config, DistributionTemplate):
         providers = config.providers
+
+        # TODO: This is a hack to get the dependencies for internal APIs into build
+        # We should have a better way to do this by formalizing the concept of "internal" APIs
+        # and providers, with a way to specify dependencies for them.
+        run_configs = config.run_configs
+        additional_pip_packages: list[str] = []
+        if run_configs:
+            for run_config in run_configs.values():
+                run_config_ = run_config.run_config(name="", providers={}, container_image=None)
+                if run_config_.inference_store:
+                    additional_pip_packages.extend(run_config_.inference_store.pip_packages)
     elif isinstance(config, BuildConfig):
         providers = config.distribution_spec.providers
+        additional_pip_packages = config.additional_pip_packages
     deps = []
     registry = get_provider_registry(config)
-
     for api_str, provider_or_providers in providers.items():
         providers_for_api = registry[Api(api_str)]
 
         providers = provider_or_providers if isinstance(provider_or_providers, list) else [provider_or_providers]
 
         for provider in providers:
-            # Providers from BuildConfig and RunConfig are subtly different – not great
+            # Providers from BuildConfig and RunConfig are subtly different - not great
             provider_type = provider if isinstance(provider, str) else provider.provider_type
 
             if provider_type not in providers_for_api:
@@ -73,6 +85,9 @@ def get_provider_dependencies(
         else:
             normal_deps.append(package)
 
+    if additional_pip_packages:
+        normal_deps.extend(additional_pip_packages)
+
     return list(set(normal_deps)), list(set(special_deps))
 
 
@@ -81,10 +96,11 @@ def print_pip_install_help(config: BuildConfig):
 
     cprint(
         f"Please install needed dependencies using the following commands:\n\nuv pip install {' '.join(normal_deps)}",
-        "yellow",
+        color="yellow",
+        file=sys.stderr,
     )
     for special_dep in special_deps:
-        cprint(f"uv pip install {special_dep}", "yellow")
+        cprint(f"uv pip install {special_dep}", color="yellow", file=sys.stderr)
     print()
 
 
diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh
index ad316d45e..c128729e1 100755
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@@ -154,6 +154,12 @@ get_python_cmd() {
     fi
 }
 
+# Add other required item commands generic to all containers
+add_to_container << EOF
+# Allows running as non-root user
+RUN mkdir -p /.llama/providers.d /.cache
+EOF
+
 if [ -n "$run_config" ]; then
   # Copy the run config to the build context since it's an absolute path
   cp "$run_config" "$BUILD_CONTEXT_DIR/run.yaml"
@@ -166,17 +172,19 @@ EOF
   # and update the configuration to reference the new container path
   python_cmd=$(get_python_cmd)
   external_providers_dir=$($python_cmd -c "import yaml; config = yaml.safe_load(open('$run_config')); print(config.get('external_providers_dir') or '')")
-  if [ -n "$external_providers_dir" ]; then
+  external_providers_dir=$(eval echo "$external_providers_dir")
+  if [ -n "$external_providers_dir" ] && [ -d "$external_providers_dir" ]; then
     echo "Copying external providers directory: $external_providers_dir"
+    cp -r "$external_providers_dir" "$BUILD_CONTEXT_DIR/providers.d"
     add_to_container << EOF
-COPY $external_providers_dir /app/providers.d
+COPY providers.d /.llama/providers.d
 EOF
-    # Edit the run.yaml file to change the external_providers_dir to /app/providers.d
+    # Edit the run.yaml file to change the external_providers_dir to /.llama/providers.d
     if [ "$(uname)" = "Darwin" ]; then
-      sed -i.bak -e 's|external_providers_dir:.*|external_providers_dir: /app/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
+      sed -i.bak -e 's|external_providers_dir:.*|external_providers_dir: /.llama/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
       rm -f "$BUILD_CONTEXT_DIR/run.yaml.bak"
     else
-      sed -i 's|external_providers_dir:.*|external_providers_dir: /app/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
+      sed -i 's|external_providers_dir:.*|external_providers_dir: /.llama/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
     fi
   fi
 fi
@@ -255,9 +263,6 @@ fi
 # Add other require item commands genearic to all containers
 add_to_container << EOF
 
-# Allows running as non-root user
-RUN mkdir -p /.llama /.cache
-
 RUN chmod -R g+rw /app /.llama /.cache
 EOF
 
diff --git a/llama_stack/distribution/client.py b/llama_stack/distribution/client.py
index 1925b864f..03e4fb051 100644
--- a/llama_stack/distribution/client.py
+++ b/llama_stack/distribution/client.py
@@ -6,9 +6,10 @@
 
 import inspect
 import json
+import sys
 from collections.abc import AsyncIterator
 from enum import Enum
-from typing import Any, Type, Union, get_args, get_origin
+from typing import Any, Union, get_args, get_origin
 
 import httpx
 from pydantic import BaseModel, parse_obj_as
@@ -27,7 +28,7 @@ async def get_client_impl(protocol, config: RemoteProviderConfig, _deps: Any):
     return impl
 
 
-def create_api_client_class(protocol) -> Type:
+def create_api_client_class(protocol) -> type:
     if protocol in _CLIENT_CLASSES:
         return _CLIENT_CLASSES[protocol]
 
@@ -96,13 +97,13 @@ def create_api_client_class(protocol) -> Type:
                             try:
                                 data = json.loads(data)
                                 if "error" in data:
-                                    cprint(data, "red")
+                                    cprint(data, color="red", file=sys.stderr)
                                     continue
 
                                 yield parse_obj_as(return_type, data)
                             except Exception as e:
-                                print(f"Error with parsing or validation: {e}")
-                                print(data)
+                                cprint(f"Error with parsing or validation: {e}", color="red", file=sys.stderr)
+                                cprint(data, color="red", file=sys.stderr)
 
         def httpx_request_params(self, method_name: str, *args, **kwargs) -> dict:
             webmethod, sig = self.routes[method_name]
diff --git a/llama_stack/distribution/common.sh b/llama_stack/distribution/common.sh
index 15220048b..5f764bcca 100755
--- a/llama_stack/distribution/common.sh
+++ b/llama_stack/distribution/common.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
diff --git a/llama_stack/distribution/configure.py b/llama_stack/distribution/configure.py
index 2a3bf7053..e58ea0338 100644
--- a/llama_stack/distribution/configure.py
+++ b/llama_stack/distribution/configure.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import logging
 import textwrap
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import (
     LLAMA_STACK_RUN_CONFIG_VERSION,
@@ -17,6 +17,7 @@ from llama_stack.distribution.distribution import (
     builtin_automatically_routed_apis,
     get_provider_registry,
 )
+from llama_stack.distribution.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.distribution.utils.prompt_for_config import prompt_for_config
 from llama_stack.providers.datatypes import Api, ProviderSpec
@@ -24,7 +25,7 @@ from llama_stack.providers.datatypes import Api, ProviderSpec
 logger = logging.getLogger(__name__)
 
 
-def configure_single_provider(registry: Dict[str, ProviderSpec], provider: Provider) -> Provider:
+def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider:
     provider_spec = registry[provider.provider_type]
     config_type = instantiate_class_type(provider_spec.config_class)
     try:
@@ -73,11 +74,7 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec
 
         existing_providers = config.providers.get(api_str, [])
         if existing_providers:
-            logger.info(
-                f"Re-configuring existing providers for API `{api_str}`...",
-                "green",
-                attrs=["bold"],
-            )
+            logger.info(f"Re-configuring existing providers for API `{api_str}`...")
             updated_providers = []
             for p in existing_providers:
                 logger.info(f"> Configuring provider `({p.provider_type})`")
@@ -91,7 +88,7 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec
             if not plist:
                 raise ValueError(f"No provider configured for API {api_str}?")
 
-            logger.info(f"Configuring API `{api_str}`...", "green", attrs=["bold"])
+            logger.info(f"Configuring API `{api_str}`...")
             updated_providers = []
             for i, provider_type in enumerate(plist):
                 if i >= 1:
@@ -120,8 +117,8 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec
 
 
 def upgrade_from_routing_table(
-    config_dict: Dict[str, Any],
-) -> Dict[str, Any]:
+    config_dict: dict[str, Any],
+) -> dict[str, Any]:
     def get_providers(entries):
         return [
             Provider(
@@ -163,7 +160,7 @@ def upgrade_from_routing_table(
     return config_dict
 
 
-def parse_and_maybe_upgrade_config(config_dict: Dict[str, Any]) -> StackRunConfig:
+def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfig:
     version = config_dict.get("version", None)
     if version == LLAMA_STACK_RUN_CONFIG_VERSION:
         return StackRunConfig(**config_dict)
@@ -174,4 +171,7 @@ def parse_and_maybe_upgrade_config(config_dict: Dict[str, Any]) -> StackRunConfi
 
     config_dict["version"] = LLAMA_STACK_RUN_CONFIG_VERSION
 
+    if not config_dict.get("external_providers_dir", None):
+        config_dict["external_providers_dir"] = EXTERNAL_PROVIDERS_DIR
+
     return StackRunConfig(**config_dict)
diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py
index 38353c1ff..def7048c0 100644
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@@ -4,9 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Annotated, Any, Dict, List, Optional, Union
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
 from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
 from llama_stack.apis.datasetio import DatasetIO
@@ -23,13 +25,14 @@ from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime
 from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.providers.datatypes import Api, ProviderSpec
-from llama_stack.providers.utils.kvstore.config import KVStoreConfig
+from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
+from llama_stack.providers.utils.sqlstore.sqlstore import SqlStoreConfig
 
 LLAMA_STACK_BUILD_CONFIG_VERSION = "2"
 LLAMA_STACK_RUN_CONFIG_VERSION = "2"
 
 
-RoutingKey = Union[str, List[str]]
+RoutingKey = str | list[str]
 
 
 class AccessAttributes(BaseModel):
@@ -46,17 +49,17 @@ class AccessAttributes(BaseModel):
     """
 
     # Standard attribute categories - the minimal set we need now
-    roles: Optional[List[str]] = Field(
+    roles: list[str] | None = Field(
         default=None, description="Role-based attributes (e.g., 'admin', 'data-scientist', 'user')"
     )
 
-    teams: Optional[List[str]] = Field(default=None, description="Team-based attributes (e.g., 'ml-team', 'nlp-team')")
+    teams: list[str] | None = Field(default=None, description="Team-based attributes (e.g., 'ml-team', 'nlp-team')")
 
-    projects: Optional[List[str]] = Field(
+    projects: list[str] | None = Field(
         default=None, description="Project-based access attributes (e.g., 'llama-3', 'customer-insights')"
     )
 
-    namespaces: Optional[List[str]] = Field(
+    namespaces: list[str] | None = Field(
         default=None, description="Namespace-based access control for resource isolation"
     )
 
@@ -105,7 +108,7 @@ class ResourceWithACL(Resource):
         # ^ User must have access to the customer-insights project AND have confidential namespace
     """
 
-    access_attributes: Optional[AccessAttributes] = None
+    access_attributes: AccessAttributes | None = None
 
 
 # Use the extended Resource for all routable objects
@@ -141,41 +144,21 @@ class ToolGroupWithACL(ToolGroup, ResourceWithACL):
     pass
 
 
-RoutableObject = Union[
-    Model,
-    Shield,
-    VectorDB,
-    Dataset,
-    ScoringFn,
-    Benchmark,
-    Tool,
-    ToolGroup,
-]
-
+RoutableObject = Model | Shield | VectorDB | Dataset | ScoringFn | Benchmark | Tool | ToolGroup
 
 RoutableObjectWithProvider = Annotated[
-    Union[
-        ModelWithACL,
-        ShieldWithACL,
-        VectorDBWithACL,
-        DatasetWithACL,
-        ScoringFnWithACL,
-        BenchmarkWithACL,
-        ToolWithACL,
-        ToolGroupWithACL,
-    ],
+    ModelWithACL
+    | ShieldWithACL
+    | VectorDBWithACL
+    | DatasetWithACL
+    | ScoringFnWithACL
+    | BenchmarkWithACL
+    | ToolWithACL
+    | ToolGroupWithACL,
     Field(discriminator="type"),
 ]
 
-RoutedProtocol = Union[
-    Inference,
-    Safety,
-    VectorIO,
-    DatasetIO,
-    Scoring,
-    Eval,
-    ToolRuntime,
-]
+RoutedProtocol = Inference | Safety | VectorIO | DatasetIO | Scoring | Eval | ToolRuntime
 
 
 # Example: /inference, /safety
@@ -183,15 +166,15 @@ class AutoRoutedProviderSpec(ProviderSpec):
     provider_type: str = "router"
     config_class: str = ""
 
-    container_image: Optional[str] = None
+    container_image: str | None = None
     routing_table_api: Api
     module: str
-    provider_data_validator: Optional[str] = Field(
+    provider_data_validator: str | None = Field(
         default=None,
     )
 
     @property
-    def pip_packages(self) -> List[str]:
+    def pip_packages(self) -> list[str]:
         raise AssertionError("Should not be called on AutoRoutedProviderSpec")
 
 
@@ -199,20 +182,20 @@ class AutoRoutedProviderSpec(ProviderSpec):
 class RoutingTableProviderSpec(ProviderSpec):
     provider_type: str = "routing_table"
     config_class: str = ""
-    container_image: Optional[str] = None
+    container_image: str | None = None
 
     router_api: Api
     module: str
-    pip_packages: List[str] = Field(default_factory=list)
+    pip_packages: list[str] = Field(default_factory=list)
 
 
 class DistributionSpec(BaseModel):
-    description: Optional[str] = Field(
+    description: str | None = Field(
         default="",
         description="Description of the distribution",
     )
-    container_image: Optional[str] = None
-    providers: Dict[str, Union[str, List[str]]] = Field(
+    container_image: str | None = None
+    providers: dict[str, str | list[str]] = Field(
         default_factory=dict,
         description="""
 Provider Types for each of the APIs provided by this distribution. If you
@@ -224,22 +207,50 @@ in the runtime configuration to help route to the correct provider.""",
 class Provider(BaseModel):
     provider_id: str
     provider_type: str
-    config: Dict[str, Any]
+    config: dict[str, Any]
 
 
 class LoggingConfig(BaseModel):
-    category_levels: Dict[str, str] = Field(
-        default_factory=Dict,
+    category_levels: dict[str, str] = Field(
+        default_factory=dict,
         description="""
  Dictionary of different logging configurations for different portions (ex: core, server) of llama stack""",
     )
 
 
+class AuthProviderType(str, Enum):
+    """Supported authentication provider types."""
+
+    OAUTH2_TOKEN = "oauth2_token"
+    CUSTOM = "custom"
+
+
 class AuthenticationConfig(BaseModel):
-    endpoint: str = Field(
+    provider_type: AuthProviderType = Field(
         ...,
-        description="Endpoint URL to validate authentication tokens",
+        description="Type of authentication provider",
     )
+    config: dict[str, Any] = Field(
+        ...,
+        description="Provider-specific configuration",
+    )
+
+
+class AuthenticationRequiredError(Exception):
+    pass
+
+
+class QuotaPeriod(str, Enum):
+    DAY = "day"
+
+
+class QuotaConfig(BaseModel):
+    kvstore: SqliteKVStoreConfig = Field(description="Config for KV store backend (SQLite only for now)")
+    anonymous_max_requests: int = Field(default=100, description="Max requests for unauthenticated clients per period")
+    authenticated_max_requests: int = Field(
+        default=1000, description="Max requests for authenticated clients per period"
+    )
+    period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set")
 
 
 class ServerConfig(BaseModel):
@@ -249,18 +260,30 @@ class ServerConfig(BaseModel):
         ge=1024,
         le=65535,
     )
-    tls_certfile: Optional[str] = Field(
+    tls_certfile: str | None = Field(
         default=None,
         description="Path to TLS certificate file for HTTPS",
     )
-    tls_keyfile: Optional[str] = Field(
+    tls_keyfile: str | None = Field(
         default=None,
         description="Path to TLS key file for HTTPS",
     )
-    auth: Optional[AuthenticationConfig] = Field(
+    tls_cafile: str | None = Field(
+        default=None,
+        description="Path to TLS CA file for HTTPS with mutual TLS authentication",
+    )
+    auth: AuthenticationConfig | None = Field(
         default=None,
         description="Authentication configuration for the server",
     )
+    host: str | None = Field(
+        default=None,
+        description="The host the server should listen on",
+    )
+    quota: QuotaConfig | None = Field(
+        default=None,
+        description="Per client quota request configuration",
+    )
 
 
 class StackRunConfig(BaseModel):
@@ -273,50 +296,66 @@ Reference to the distribution this package refers to. For unregistered (adhoc) p
 this could be just a hash
 """,
     )
-    container_image: Optional[str] = Field(
+    container_image: str | None = Field(
         default=None,
         description="Reference to the container image if this package refers to a container",
     )
-    apis: List[str] = Field(
+    apis: list[str] = Field(
         default_factory=list,
         description="""
 The list of APIs to serve. If not specified, all APIs specified in the provider_map will be served""",
     )
 
-    providers: Dict[str, List[Provider]] = Field(
+    providers: dict[str, list[Provider]] = Field(
         description="""
 One or more providers to use for each API. The same provider_type (e.g., meta-reference)
 can be instantiated multiple times (with different configs) if necessary.
 """,
     )
-    metadata_store: Optional[KVStoreConfig] = Field(
+    metadata_store: KVStoreConfig | None = Field(
         default=None,
         description="""
 Configuration for the persistence store used by the distribution registry. If not specified,
 a default SQLite store will be used.""",
     )
 
-    # registry of "resources" in the distribution
-    models: List[ModelInput] = Field(default_factory=list)
-    shields: List[ShieldInput] = Field(default_factory=list)
-    vector_dbs: List[VectorDBInput] = Field(default_factory=list)
-    datasets: List[DatasetInput] = Field(default_factory=list)
-    scoring_fns: List[ScoringFnInput] = Field(default_factory=list)
-    benchmarks: List[BenchmarkInput] = Field(default_factory=list)
-    tool_groups: List[ToolGroupInput] = Field(default_factory=list)
+    inference_store: SqlStoreConfig | None = Field(
+        default=None,
+        description="""
+Configuration for the persistence store used by the inference API. If not specified,
+a default SQLite store will be used.""",
+    )
 
-    logging: Optional[LoggingConfig] = Field(default=None, description="Configuration for Llama Stack Logging")
+    # registry of "resources" in the distribution
+    models: list[ModelInput] = Field(default_factory=list)
+    shields: list[ShieldInput] = Field(default_factory=list)
+    vector_dbs: list[VectorDBInput] = Field(default_factory=list)
+    datasets: list[DatasetInput] = Field(default_factory=list)
+    scoring_fns: list[ScoringFnInput] = Field(default_factory=list)
+    benchmarks: list[BenchmarkInput] = Field(default_factory=list)
+    tool_groups: list[ToolGroupInput] = Field(default_factory=list)
+
+    logging: LoggingConfig | None = Field(default=None, description="Configuration for Llama Stack Logging")
 
     server: ServerConfig = Field(
         default_factory=ServerConfig,
         description="Configuration for the HTTP(S) server",
     )
 
-    external_providers_dir: Optional[str] = Field(
+    external_providers_dir: Path | None = Field(
         default=None,
         description="Path to directory containing external provider implementations. The providers code and dependencies must be installed on the system.",
     )
 
+    @field_validator("external_providers_dir")
+    @classmethod
+    def validate_external_providers_dir(cls, v):
+        """Normalize the configured external providers directory.
+
+        A ``str`` is wrapped in ``Path``; ``None`` and any other value are returned unchanged.
+        """
+        return Path(v) if isinstance(v, str) else v
+
 
 class BuildConfig(BaseModel):
     version: str = LLAMA_STACK_BUILD_CONFIG_VERSION
@@ -326,12 +365,25 @@ class BuildConfig(BaseModel):
         default="conda",
         description="Type of package to build (conda | container | venv)",
     )
-    image_name: Optional[str] = Field(
+    image_name: str | None = Field(
         default=None,
         description="Name of the distribution to build",
     )
-    external_providers_dir: Optional[str] = Field(
+    external_providers_dir: Path | None = Field(
         default=None,
         description="Path to directory containing external provider implementations. The providers packages will be resolved from this directory. "
         "pip_packages MUST contain the provider package name.",
     )
+    additional_pip_packages: list[str] = Field(
+        default_factory=list,
+        description="Additional pip packages to install in the distribution. These packages will be installed in the distribution environment.",
+    )
+
+    @field_validator("external_providers_dir")
+    @classmethod
+    def validate_external_providers_dir(cls, v):
+        """Normalize the configured directory: a ``str`` becomes a ``Path``, anything else is kept.
+
+        ``None`` (the field default) is returned unchanged.
+        """
+        return Path(v) if isinstance(v, str) else v
diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py
index f948ddf1c..b860d15ab 100644
--- a/llama_stack/distribution/distribution.py
+++ b/llama_stack/distribution/distribution.py
@@ -7,7 +7,7 @@
 import glob
 import importlib
 import os
-from typing import Any, Dict, List
+from typing import Any
 
 import yaml
 from pydantic import BaseModel
@@ -24,7 +24,7 @@ from llama_stack.providers.datatypes import (
 logger = get_logger(name=__name__, category="core")
 
 
-def stack_apis() -> List[Api]:
+def stack_apis() -> list[Api]:
     return list(Api)
 
 
@@ -33,7 +33,7 @@ class AutoRoutedApiInfo(BaseModel):
     router_api: Api
 
 
-def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
+def builtin_automatically_routed_apis() -> list[AutoRoutedApiInfo]:
     return [
         AutoRoutedApiInfo(
             routing_table_api=Api.models,
@@ -66,12 +66,12 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
     ]
 
 
-def providable_apis() -> List[Api]:
+def providable_apis() -> list[Api]:
     routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()}
     return [api for api in Api if api not in routing_table_apis and api != Api.inspect and api != Api.providers]
 
 
-def _load_remote_provider_spec(spec_data: Dict[str, Any], api: Api) -> ProviderSpec:
+def _load_remote_provider_spec(spec_data: dict[str, Any], api: Api) -> ProviderSpec:
     adapter = AdapterSpec(**spec_data["adapter"])
     spec = remote_provider_spec(
         api=api,
@@ -81,7 +81,7 @@ def _load_remote_provider_spec(spec_data: Dict[str, Any], api: Api) -> ProviderS
     return spec
 
 
-def _load_inline_provider_spec(spec_data: Dict[str, Any], api: Api, provider_name: str) -> ProviderSpec:
+def _load_inline_provider_spec(spec_data: dict[str, Any], api: Api, provider_name: str) -> ProviderSpec:
     spec = InlineProviderSpec(
         api=api,
         provider_type=f"inline::{provider_name}",
@@ -98,7 +98,7 @@ def _load_inline_provider_spec(spec_data: Dict[str, Any], api: Api, provider_nam
 
 def get_provider_registry(
     config=None,
-) -> Dict[Api, Dict[str, ProviderSpec]]:
+) -> dict[Api, dict[str, ProviderSpec]]:
     """Get the provider registry, optionally including external providers.
 
     This function loads both built-in providers and external providers from YAML files.
@@ -133,7 +133,7 @@ def get_provider_registry(
         ValueError: If any provider spec is invalid
     """
 
-    ret: Dict[Api, Dict[str, ProviderSpec]] = {}
+    ret: dict[Api, dict[str, ProviderSpec]] = {}
     for api in providable_apis():
         name = api.name.lower()
         logger.debug(f"Importing module {name}")
@@ -145,7 +145,7 @@ def get_provider_registry(
 
     # Check if config has the external_providers_dir attribute
     if config and hasattr(config, "external_providers_dir") and config.external_providers_dir:
-        external_providers_dir = os.path.abspath(config.external_providers_dir)
+        external_providers_dir = os.path.abspath(os.path.expanduser(config.external_providers_dir))
         if not os.path.exists(external_providers_dir):
             raise FileNotFoundError(f"External providers directory not found: {external_providers_dir}")
         logger.info(f"Loading external providers from {external_providers_dir}")
diff --git a/llama_stack/distribution/inspect.py b/llama_stack/distribution/inspect.py
index 23f644ec6..5822070ad 100644
--- a/llama_stack/distribution/inspect.py
+++ b/llama_stack/distribution/inspect.py
@@ -16,7 +16,7 @@ from llama_stack.apis.inspect import (
     VersionInfo,
 )
 from llama_stack.distribution.datatypes import StackRunConfig
-from llama_stack.distribution.server.endpoints import get_all_api_endpoints
+from llama_stack.distribution.server.routes import get_all_api_routes
 from llama_stack.providers.datatypes import HealthStatus
 
 
@@ -31,7 +31,7 @@ async def get_provider_impl(config, deps):
 
 
 class DistributionInspectImpl(Inspect):
-    def __init__(self, config, deps):
+    def __init__(self, config: DistributionInspectConfig, deps):
         self.config = config
         self.deps = deps
 
@@ -39,22 +39,36 @@ class DistributionInspectImpl(Inspect):
         pass
 
     async def list_routes(self) -> ListRoutesResponse:
-        run_config = self.config.run_config
+        run_config: StackRunConfig = self.config.run_config
 
         ret = []
-        all_endpoints = get_all_api_endpoints()
+        all_endpoints = get_all_api_routes()
         for api, endpoints in all_endpoints.items():
-            providers = run_config.providers.get(api.value, [])
-            ret.extend(
-                [
-                    RouteInfo(
-                        route=e.route,
-                        method=e.method,
-                        provider_types=[p.provider_type for p in providers],
+            # "providers" and "inspect" are always listed; other APIs only when the run config has providers for them
+            if api.value in ["providers", "inspect"]:
+                ret.extend(
+                    [
+                        RouteInfo(
+                            route=e.path,
+                            method=next(iter([m for m in e.methods if m != "HEAD"])),  # first method other than HEAD
+                            provider_types=[],  # These APIs don't have "real" providers - they're internal to the stack
+                        )
+                        for e in endpoints
+                    ]
+                )
+            else:
+                providers = run_config.providers.get(api.value, [])
+                if providers:  # APIs without providers in the run config contribute no routes
+                    ret.extend(
+                        [
+                            RouteInfo(
+                                route=e.path,
+                                method=next(iter([m for m in e.methods if m != "HEAD"])),  # first method other than HEAD
+                                provider_types=[p.provider_type for p in providers],
+                            )
+                            for e in endpoints
+                        ]
                     )
-                    for e in endpoints
-                ]
-            )
 
         return ListRoutesResponse(data=ret)
 
diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py
index f426bcafe..f32130cf9 100644
--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@@ -9,10 +9,11 @@ import inspect
 import json
 import logging
 import os
+import sys
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum
 from pathlib import Path
-from typing import Any, Optional, TypeVar, Union, get_args, get_origin
+from typing import Any, TypeVar, Union, get_args, get_origin
 
 import httpx
 import yaml
@@ -30,16 +31,13 @@ from termcolor import cprint
 
 from llama_stack.distribution.build import print_pip_install_help
 from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-from llama_stack.distribution.datatypes import Api
+from llama_stack.distribution.datatypes import Api, BuildConfig, DistributionSpec
 from llama_stack.distribution.request_headers import (
     PROVIDER_DATA_VAR,
     request_provider_data_context,
 )
 from llama_stack.distribution.resolver import ProviderRegistry
-from llama_stack.distribution.server.endpoints import (
-    find_matching_endpoint,
-    initialize_endpoint_impls,
-)
+from llama_stack.distribution.server.routes import find_matching_route, initialize_route_impls
 from llama_stack.distribution.stack import (
     construct_stack,
     get_stack_run_config_from_template,
@@ -119,8 +117,8 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
         self,
         config_path_or_template_name: str,
         skip_logger_removal: bool = False,
-        custom_provider_registry: Optional[ProviderRegistry] = None,
-        provider_data: Optional[dict[str, Any]] = None,
+        custom_provider_registry: ProviderRegistry | None = None,
+        provider_data: dict[str, Any] | None = None,
     ):
         super().__init__()
         self.async_client = AsyncLlamaStackAsLibraryClient(
@@ -181,8 +179,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
     def __init__(
         self,
         config_path_or_template_name: str,
-        custom_provider_registry: Optional[ProviderRegistry] = None,
-        provider_data: Optional[dict[str, Any]] = None,
+        custom_provider_registry: ProviderRegistry | None = None,
+        provider_data: dict[str, Any] | None = None,
     ):
         super().__init__()
         # when using the library client, we should not log to console since many
@@ -207,22 +205,41 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
 
     async def initialize(self) -> bool:
         try:
-            self.endpoint_impls = None
+            self.route_impls = None
             self.impls = await construct_stack(self.config, self.custom_provider_registry)
         except ModuleNotFoundError as _e:
-            cprint(_e.msg, "red")
+            cprint(_e.msg, color="red", file=sys.stderr)
             cprint(
                 "Using llama-stack as a library requires installing dependencies depending on the template (providers) you choose.\n",
-                "yellow",
+                color="yellow",
+                file=sys.stderr,
             )
             if self.config_path_or_template_name.endswith(".yaml"):
-                print_pip_install_help(self.config.providers)
+                # Convert Provider objects to their types
+                provider_types: dict[str, str | list[str]] = {}
+                for api, providers in self.config.providers.items():
+                    types = [p.provider_type for p in providers]
+                    # Convert single-item lists to strings
+                    provider_types[api] = types[0] if len(types) == 1 else types
+                build_config = BuildConfig(
+                    distribution_spec=DistributionSpec(
+                        providers=provider_types,
+                    ),
+                    external_providers_dir=self.config.external_providers_dir,
+                )
+                print_pip_install_help(build_config)
             else:
                 prefix = "!" if in_notebook() else ""
                 cprint(
                     f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n",
                     "yellow",
+                    file=sys.stderr,
                 )
+            cprint(
+                "Please check your internet connection and try again.",
+                "red",
+                file=sys.stderr,
+            )
             raise _e
 
         if Api.telemetry in self.impls:
@@ -234,7 +251,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
             safe_config = redact_sensitive_fields(self.config.model_dump())
             console.print(yaml.dump(safe_config, indent=2))
 
-        self.endpoint_impls = initialize_endpoint_impls(self.impls)
+        self.route_impls = initialize_route_impls(self.impls)
         return True
 
     async def request(
@@ -245,13 +262,15 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         stream=False,
         stream_cls=None,
     ):
-        if not self.endpoint_impls:
+        if not self.route_impls:
             raise ValueError("Client not initialized")
 
         # Create headers with provider data if available
-        headers = {}
+        headers = options.headers or {}
         if self.provider_data:
-            headers["X-LlamaStack-Provider-Data"] = json.dumps(self.provider_data)
+            keys = ["X-LlamaStack-Provider-Data", "x-llamastack-provider-data"]
+            if all(key not in headers for key in keys):
+                headers["X-LlamaStack-Provider-Data"] = json.dumps(self.provider_data)
 
         # Use context manager for provider data
         with request_provider_data_context(headers):
@@ -274,11 +293,14 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         cast_to: Any,
         options: Any,
     ):
+        if self.route_impls is None:
+            raise ValueError("Client not initialized")
+
         path = options.url
         body = options.params or {}
         body |= options.json_data or {}
 
-        matched_func, path_params, route = find_matching_endpoint(options.method, path, self.endpoint_impls)
+        matched_func, path_params, route = find_matching_route(options.method, path, self.route_impls)
         body |= path_params
         body = self._convert_body(path, options.method, body)
         await start_trace(route, {"__location__": "library_client"})
@@ -320,10 +342,13 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         options: Any,
         stream_cls: Any,
     ):
+        if self.route_impls is None:
+            raise ValueError("Client not initialized")
+
         path = options.url
         body = options.params or {}
         body |= options.json_data or {}
-        func, path_params, route = find_matching_endpoint(options.method, path, self.endpoint_impls)
+        func, path_params, route = find_matching_route(options.method, path, self.route_impls)
         body |= path_params
 
         body = self._convert_body(path, options.method, body)
@@ -371,11 +396,14 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         )
         return await response.parse()
 
-    def _convert_body(self, path: str, method: str, body: Optional[dict] = None) -> dict:
+    def _convert_body(self, path: str, method: str, body: dict | None = None) -> dict:
         if not body:
             return {}
 
-        func, _, _ = find_matching_endpoint(method, path, self.endpoint_impls)
+        if self.route_impls is None:
+            raise ValueError("Client not initialized")
+
+        func, _, _ = find_matching_route(method, path, self.route_impls)
         sig = inspect.signature(func)
 
         # Strip NOT_GIVENs to use the defaults in signature
diff --git a/llama_stack/distribution/providers.py b/llama_stack/distribution/providers.py
index 1c00ce264..29b7109dd 100644
--- a/llama_stack/distribution/providers.py
+++ b/llama_stack/distribution/providers.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import asyncio
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -73,14 +73,14 @@ class ProviderImpl(Providers):
 
         raise ValueError(f"Provider {provider_id} not found")
 
-    async def get_providers_health(self) -> Dict[str, Dict[str, HealthResponse]]:
+    async def get_providers_health(self) -> dict[str, dict[str, HealthResponse]]:
         """Get health status for all providers.
 
         Returns:
             Dict[str, Dict[str, HealthResponse]]: A dictionary mapping API names to provider health statuses.
                 Each API maps to a dictionary of provider IDs to their health responses.
         """
-        providers_health: Dict[str, Dict[str, HealthResponse]] = {}
+        providers_health: dict[str, dict[str, HealthResponse]] = {}
         timeout = 1.0
 
         async def check_provider_health(impl: Any) -> tuple[str, HealthResponse] | None:
@@ -99,7 +99,7 @@ class ProviderImpl(Providers):
             try:
                 health = await asyncio.wait_for(impl.health(), timeout=timeout)
                 return api_name, health
-            except asyncio.TimeoutError:
+            except (asyncio.TimeoutError, TimeoutError):
                 return (
                     api_name,
                     HealthResponse(
diff --git a/llama_stack/distribution/request_headers.py b/llama_stack/distribution/request_headers.py
index f9cde2cdf..b03d2dee8 100644
--- a/llama_stack/distribution/request_headers.py
+++ b/llama_stack/distribution/request_headers.py
@@ -7,7 +7,8 @@
 import contextvars
 import json
 import logging
-from typing import Any, ContextManager, Dict, List, Optional
+from contextlib import AbstractContextManager
+from typing import Any
 
 from .utils.dynamic import instantiate_class_type
 
@@ -17,11 +18,11 @@ log = logging.getLogger(__name__)
 PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None)
 
 
-class RequestProviderDataContext(ContextManager):
+class RequestProviderDataContext(AbstractContextManager):
     """Context manager for request provider data"""
 
     def __init__(
-        self, provider_data: Optional[Dict[str, Any]] = None, auth_attributes: Optional[Dict[str, List[str]]] = None
+        self, provider_data: dict[str, Any] | None = None, auth_attributes: dict[str, list[str]] | None = None
     ):
         self.provider_data = provider_data or {}
         if auth_attributes:
@@ -43,7 +44,8 @@ class RequestProviderDataContext(ContextManager):
 class NeedsRequestProviderData:
     def get_request_provider_data(self) -> Any:
         spec = self.__provider_spec__
-        assert spec, f"Provider spec not set on {self.__class__}"
+        if not spec:
+            raise ValueError(f"Provider spec not set on {self.__class__}")
 
         provider_type = spec.provider_type
         validator_class = spec.provider_data_validator
@@ -63,7 +65,7 @@ class NeedsRequestProviderData:
             return None
 
 
-def parse_request_provider_data(headers: Dict[str, str]) -> Optional[Dict[str, Any]]:
+def parse_request_provider_data(headers: dict[str, str]) -> dict[str, Any] | None:
     """Parse provider data from request headers"""
     keys = [
         "X-LlamaStack-Provider-Data",
@@ -86,14 +88,14 @@ def parse_request_provider_data(headers: Dict[str, str]) -> Optional[Dict[str, A
 
 
 def request_provider_data_context(
-    headers: Dict[str, str], auth_attributes: Optional[Dict[str, List[str]]] = None
-) -> ContextManager:
+    headers: dict[str, str], auth_attributes: dict[str, list[str]] | None = None
+) -> AbstractContextManager:
     """Context manager that sets request provider data from headers and auth attributes for the duration of the context"""
     provider_data = parse_request_provider_data(headers)
     return RequestProviderDataContext(provider_data, auth_attributes)
 
 
-def get_auth_attributes() -> Optional[Dict[str, List[str]]]:
+def get_auth_attributes() -> dict[str, list[str]] | None:
     """Helper to retrieve auth attributes from the provider data context"""
     provider_data = PROVIDER_DATA_VAR.get()
     if not provider_data:
diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py
index e9a594eba..b7c7cb87f 100644
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import importlib
 import inspect
-from typing import Any, Dict, List, Set, Tuple
+from typing import Any
 
 from llama_stack.apis.agents import Agents
 from llama_stack.apis.benchmarks import Benchmarks
@@ -13,7 +13,7 @@ from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval import Eval
 from llama_stack.apis.files import Files
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import Inference, InferenceProvider
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
 from llama_stack.apis.post_training import PostTraining
@@ -47,7 +47,7 @@ from llama_stack.providers.datatypes import (
     RemoteProviderSpec,
     ScoringFunctionsProtocolPrivate,
     ShieldsProtocolPrivate,
-    ToolsProtocolPrivate,
+    ToolGroupsProtocolPrivate,
     VectorDBsProtocolPrivate,
 )
 
@@ -58,7 +58,7 @@ class InvalidProviderError(Exception):
     pass
 
 
-def api_protocol_map() -> Dict[Api, Any]:
+def api_protocol_map() -> dict[Api, Any]:
     return {
         Api.providers: ProvidersAPI,
         Api.agents: Agents,
@@ -83,10 +83,17 @@ def api_protocol_map() -> Dict[Api, Any]:
     }
 
 
-def additional_protocols_map() -> Dict[Api, Any]:
+def api_protocol_map_for_compliance_check() -> dict[Api, Any]:
+    """Same as api_protocol_map(), except that Api.inference maps to InferenceProvider."""
+    protocols = api_protocol_map()
+    protocols[Api.inference] = InferenceProvider
+    return protocols
+
+
+def additional_protocols_map() -> dict[Api, Any]:
     return {
         Api.inference: (ModelsProtocolPrivate, Models, Api.models),
-        Api.tool_groups: (ToolsProtocolPrivate, ToolGroups, Api.tool_groups),
+        Api.tool_groups: (ToolGroupsProtocolPrivate, ToolGroups, Api.tool_groups),
         Api.vector_io: (VectorDBsProtocolPrivate, VectorDBs, Api.vector_dbs),
         Api.safety: (ShieldsProtocolPrivate, Shields, Api.shields),
         Api.datasetio: (DatasetsProtocolPrivate, Datasets, Api.datasets),
@@ -104,14 +111,14 @@ class ProviderWithSpec(Provider):
     spec: ProviderSpec
 
 
-ProviderRegistry = Dict[Api, Dict[str, ProviderSpec]]
+ProviderRegistry = dict[Api, dict[str, ProviderSpec]]
 
 
 async def resolve_impls(
     run_config: StackRunConfig,
     provider_registry: ProviderRegistry,
     dist_registry: DistributionRegistry,
-) -> Dict[Api, Any]:
+) -> dict[Api, Any]:
     """
     Resolves provider implementations by:
     1. Validating and organizing providers.
@@ -133,10 +140,10 @@ async def resolve_impls(
 
     sorted_providers = sort_providers_by_deps(providers_with_specs, run_config)
 
-    return await instantiate_providers(sorted_providers, router_apis, dist_registry)
+    return await instantiate_providers(sorted_providers, router_apis, dist_registry, run_config)
 
 
-def specs_for_autorouted_apis(apis_to_serve: List[str] | Set[str]) -> Dict[str, Dict[str, ProviderWithSpec]]:
+def specs_for_autorouted_apis(apis_to_serve: list[str] | set[str]) -> dict[str, dict[str, ProviderWithSpec]]:
     """Generates specifications for automatically routed APIs."""
     specs = {}
     for info in builtin_automatically_routed_apis():
@@ -178,10 +185,10 @@ def specs_for_autorouted_apis(apis_to_serve: List[str] | Set[str]) -> Dict[str,
 
 
 def validate_and_prepare_providers(
-    run_config: StackRunConfig, provider_registry: ProviderRegistry, routing_table_apis: Set[Api], router_apis: Set[Api]
-) -> Dict[str, Dict[str, ProviderWithSpec]]:
+    run_config: StackRunConfig, provider_registry: ProviderRegistry, routing_table_apis: set[Api], router_apis: set[Api]
+) -> dict[str, dict[str, ProviderWithSpec]]:
     """Validates providers, handles deprecations, and organizes them into a spec dictionary."""
-    providers_with_specs: Dict[str, Dict[str, ProviderWithSpec]] = {}
+    providers_with_specs: dict[str, dict[str, ProviderWithSpec]] = {}
 
     for api_str, providers in run_config.providers.items():
         api = Api(api_str)
@@ -222,10 +229,10 @@ def validate_provider(provider: Provider, api: Api, provider_registry: ProviderR
 
 
 def sort_providers_by_deps(
-    providers_with_specs: Dict[str, Dict[str, ProviderWithSpec]], run_config: StackRunConfig
-) -> List[Tuple[str, ProviderWithSpec]]:
+    providers_with_specs: dict[str, dict[str, ProviderWithSpec]], run_config: StackRunConfig
+) -> list[tuple[str, ProviderWithSpec]]:
     """Sorts providers based on their dependencies."""
-    sorted_providers: List[Tuple[str, ProviderWithSpec]] = topological_sort(
+    sorted_providers: list[tuple[str, ProviderWithSpec]] = topological_sort(
         {k: list(v.values()) for k, v in providers_with_specs.items()}
     )
 
@@ -236,11 +243,14 @@ def sort_providers_by_deps(
 
 
 async def instantiate_providers(
-    sorted_providers: List[Tuple[str, ProviderWithSpec]], router_apis: Set[Api], dist_registry: DistributionRegistry
-) -> Dict:
+    sorted_providers: list[tuple[str, ProviderWithSpec]],
+    router_apis: set[Api],
+    dist_registry: DistributionRegistry,
+    run_config: StackRunConfig,
+) -> dict:
     """Instantiates providers asynchronously while managing dependencies."""
-    impls: Dict[Api, Any] = {}
-    inner_impls_by_provider_id: Dict[str, Dict[str, Any]] = {f"inner-{x.value}": {} for x in router_apis}
+    impls: dict[Api, Any] = {}
+    inner_impls_by_provider_id: dict[str, dict[str, Any]] = {f"inner-{x.value}": {} for x in router_apis}
     for api_str, provider in sorted_providers:
         deps = {a: impls[a] for a in provider.spec.api_dependencies}
         for a in provider.spec.optional_api_dependencies:
@@ -251,7 +261,7 @@ async def instantiate_providers(
         if isinstance(provider.spec, RoutingTableProviderSpec):
             inner_impls = inner_impls_by_provider_id[f"inner-{provider.spec.router_api.value}"]
 
-        impl = await instantiate_provider(provider, deps, inner_impls, dist_registry)
+        impl = await instantiate_provider(provider, deps, inner_impls, dist_registry, run_config)
 
         if api_str.startswith("inner-"):
             inner_impls_by_provider_id[api_str][provider.provider_id] = impl
@@ -263,9 +273,9 @@ async def instantiate_providers(
 
 
 def topological_sort(
-    providers_with_specs: Dict[str, List[ProviderWithSpec]],
-) -> List[Tuple[str, ProviderWithSpec]]:
-    def dfs(kv, visited: Set[str], stack: List[str]):
+    providers_with_specs: dict[str, list[ProviderWithSpec]],
+) -> list[tuple[str, ProviderWithSpec]]:
+    def dfs(kv, visited: set[str], stack: list[str]):
         api_str, providers = kv
         visited.add(api_str)
 
@@ -280,8 +290,8 @@ def topological_sort(
 
         stack.append(api_str)
 
-    visited: Set[str] = set()
-    stack: List[str] = []
+    visited: set[str] = set()
+    stack: list[str] = []
 
     for api_str, providers in providers_with_specs.items():
         if api_str not in visited:
@@ -298,13 +308,11 @@ def topological_sort(
 # returns a class implementing the protocol corresponding to the Api
 async def instantiate_provider(
     provider: ProviderWithSpec,
-    deps: Dict[Api, Any],
-    inner_impls: Dict[str, Any],
+    deps: dict[Api, Any],
+    inner_impls: dict[str, Any],
     dist_registry: DistributionRegistry,
+    run_config: StackRunConfig,
 ):
-    protocols = api_protocol_map()
-    additional_protocols = additional_protocols_map()
-
     provider_spec = provider.spec
     if not hasattr(provider_spec, "module"):
         raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")
@@ -323,7 +331,7 @@ async def instantiate_provider(
         method = "get_auto_router_impl"
 
         config = None
-        args = [provider_spec.api, deps[provider_spec.routing_table_api], deps]
+        args = [provider_spec.api, deps[provider_spec.routing_table_api], deps, run_config]
     elif isinstance(provider_spec, RoutingTableProviderSpec):
         method = "get_routing_table_impl"
 
@@ -342,6 +350,8 @@ async def instantiate_provider(
     impl.__provider_spec__ = provider_spec
     impl.__provider_config__ = config
 
+    protocols = api_protocol_map_for_compliance_check()
+    additional_protocols = additional_protocols_map()
     # TODO: check compliance for special tool groups
     # the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
     check_protocol_compliance(impl, protocols[provider_spec.api])
@@ -391,8 +401,8 @@ def check_protocol_compliance(obj: Any, protocol: Any) -> None:
 
 async def resolve_remote_stack_impls(
     config: RemoteProviderConfig,
-    apis: List[str],
-) -> Dict[Api, Any]:
+    apis: list[str],
+) -> dict[Api, Any]:
     protocols = api_protocol_map()
     additional_protocols = additional_protocols_map()
 
diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py
index d0fca8771..1358d5812 100644
--- a/llama_stack/distribution/routers/__init__.py
+++ b/llama_stack/distribution/routers/__init__.py
@@ -4,29 +4,29 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import RoutedProtocol
+from llama_stack.distribution.stack import StackRunConfig
 from llama_stack.distribution.store import DistributionRegistry
 from llama_stack.providers.datatypes import Api, RoutingTable
-
-from .routing_tables import (
-    BenchmarksRoutingTable,
-    DatasetsRoutingTable,
-    ModelsRoutingTable,
-    ScoringFunctionsRoutingTable,
-    ShieldsRoutingTable,
-    ToolGroupsRoutingTable,
-    VectorDBsRoutingTable,
-)
+from llama_stack.providers.utils.inference.inference_store import InferenceStore
 
 
 async def get_routing_table_impl(
     api: Api,
-    impls_by_provider_id: Dict[str, RoutedProtocol],
+    impls_by_provider_id: dict[str, RoutedProtocol],
     _deps,
     dist_registry: DistributionRegistry,
 ) -> Any:
+    from ..routing_tables.benchmarks import BenchmarksRoutingTable
+    from ..routing_tables.datasets import DatasetsRoutingTable
+    from ..routing_tables.models import ModelsRoutingTable
+    from ..routing_tables.scoring_functions import ScoringFunctionsRoutingTable
+    from ..routing_tables.shields import ShieldsRoutingTable
+    from ..routing_tables.toolgroups import ToolGroupsRoutingTable
+    from ..routing_tables.vector_dbs import VectorDBsRoutingTable
+
     api_to_tables = {
         "vector_dbs": VectorDBsRoutingTable,
         "models": ModelsRoutingTable,
@@ -45,16 +45,15 @@ async def get_routing_table_impl(
     return impl
 
 
-async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: Dict[str, Any]) -> Any:
-    from .routers import (
-        DatasetIORouter,
-        EvalRouter,
-        InferenceRouter,
-        SafetyRouter,
-        ScoringRouter,
-        ToolRuntimeRouter,
-        VectorIORouter,
-    )
+async def get_auto_router_impl(
+    api: Api, routing_table: RoutingTable, deps: dict[str, Any], run_config: StackRunConfig
+) -> Any:
+    from .datasets import DatasetIORouter
+    from .eval_scoring import EvalRouter, ScoringRouter
+    from .inference import InferenceRouter
+    from .safety import SafetyRouter
+    from .tool_runtime import ToolRuntimeRouter
+    from .vector_io import VectorIORouter
 
     api_to_routers = {
         "vector_io": VectorIORouter,
@@ -76,6 +75,12 @@ async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: Dict
         if dep_api in deps:
             api_to_dep_impl[dep_name] = deps[dep_api]
 
+    # TODO: move pass configs to routers instead
+    if api == Api.inference and run_config.inference_store:
+        inference_store = InferenceStore(run_config.inference_store)
+        await inference_store.initialize()
+        api_to_dep_impl["store"] = inference_store
+
     impl = api_to_routers[api.value](routing_table, **api_to_dep_impl)
     await impl.initialize()
     return impl
diff --git a/llama_stack/distribution/routers/datasets.py b/llama_stack/distribution/routers/datasets.py
new file mode 100644
index 000000000..6f28756c9
--- /dev/null
+++ b/llama_stack/distribution/routers/datasets.py
@@ -0,0 +1,71 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
+from llama_stack.apis.datasets import DatasetPurpose, DataSource
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import RoutingTable
+
+logger = get_logger(name=__name__, category="core")
+
+
+class DatasetIORouter(DatasetIO):
+    """Forwards DatasetIO calls to the provider the routing table resolves for a dataset id."""
+
+    def __init__(self, routing_table: RoutingTable) -> None:
+        """Keep the routing table that every later call consults."""
+        logger.debug("Initializing DatasetIORouter")
+        self.routing_table = routing_table
+
+    async def initialize(self) -> None:
+        """Log the call; the router itself has nothing to set up."""
+        logger.debug("DatasetIORouter.initialize")
+
+    async def shutdown(self) -> None:
+        """Log the call; the router itself has nothing to tear down."""
+        logger.debug("DatasetIORouter.shutdown")
+
+    async def register_dataset(
+        self,
+        purpose: DatasetPurpose,
+        source: DataSource,
+        metadata: dict[str, Any] | None = None,
+        dataset_id: str | None = None,
+    ) -> None:
+        """Register a dataset via the routing table.
+
+        All four arguments are passed through by keyword; whatever the routing
+        table returns is awaited and then discarded.
+        """
+        logger.debug(f"DatasetIORouter.register_dataset: {purpose=} {source=} {metadata=} {dataset_id=}")
+        registration = dict(purpose=purpose, source=source, metadata=metadata, dataset_id=dataset_id)
+        await self.routing_table.register_dataset(**registration)
+
+    async def iterrows(
+        self,
+        dataset_id: str,
+        start_index: int | None = None,
+        limit: int | None = None,
+    ) -> PaginatedResponse:
+        """Return a page of rows from the provider resolved for ``dataset_id``.
+
+        ``start_index`` and ``limit`` are forwarded unchanged, including ``None``.
+        """
+        logger.debug(f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}")
+        provider = self.routing_table.get_provider_impl(dataset_id)
+        return await provider.iterrows(dataset_id=dataset_id, start_index=start_index, limit=limit)
+
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
+        """Append ``rows`` through the provider resolved for ``dataset_id``.
+
+        The provider's return value is passed back to the caller as-is.
+        """
+        logger.debug(f"DatasetIORouter.append_rows: {dataset_id}, {len(rows)} rows")
+        provider = self.routing_table.get_provider_impl(dataset_id)
+        return await provider.append_rows(dataset_id=dataset_id, rows=rows)
diff --git a/llama_stack/distribution/routers/eval_scoring.py b/llama_stack/distribution/routers/eval_scoring.py
new file mode 100644
index 000000000..fd0bb90a7
--- /dev/null
+++ b/llama_stack/distribution/routers/eval_scoring.py
@@ -0,0 +1,148 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
+from llama_stack.apis.scoring import (
+    ScoreBatchResponse,
+    ScoreResponse,
+    Scoring,
+    ScoringFnParams,
+)
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import RoutingTable
+
+logger = get_logger(name=__name__, category="core")
+
+
+class ScoringRouter(Scoring):
+    """Fans scoring requests out to the provider the routing table returns per scoring function id."""
+
+    def __init__(self, routing_table: RoutingTable) -> None:
+        """Keep the routing table that every later call consults."""
+        logger.debug("Initializing ScoringRouter")
+        self.routing_table = routing_table
+
+    async def initialize(self) -> None:
+        """Log the call; the router itself has nothing to set up."""
+        logger.debug("ScoringRouter.initialize")
+
+    async def shutdown(self) -> None:
+        """Log the call; the router itself has nothing to tear down."""
+        logger.debug("ScoringRouter.shutdown")
+
+    async def score_batch(
+        self,
+        dataset_id: str,
+        scoring_functions: dict[str, ScoringFnParams | None] | None = None,
+        save_results_dataset: bool = False,
+    ) -> ScoreBatchResponse:
+        """Score ``dataset_id`` with each function and merge the per-function results.
+
+        A missing ``scoring_functions`` mapping is treated as empty. Raises
+        NotImplementedError before any provider call if ``save_results_dataset`` is set.
+        """
+        logger.debug(f"ScoringRouter.score_batch: {dataset_id}")
+        if save_results_dataset:
+            raise NotImplementedError("Save results dataset not implemented yet")
+
+        merged = {}
+        # None (the default) means no scoring functions rather than an AttributeError.
+        for fn_id, fn_params in (scoring_functions or {}).items():
+            provider = self.routing_table.get_provider_impl(fn_id)
+            response = await provider.score_batch(dataset_id=dataset_id, scoring_functions={fn_id: fn_params})
+            merged.update(response.results)
+        return ScoreBatchResponse(results=merged)
+
+    async def score(
+        self,
+        input_rows: list[dict[str, Any]],
+        scoring_functions: dict[str, ScoringFnParams | None] | None = None,
+    ) -> ScoreResponse:
+        """Score ``input_rows`` with each function; a missing mapping is treated as empty."""
+        requested = scoring_functions or {}
+        logger.debug(f"ScoringRouter.score: {len(input_rows)} rows, {len(requested)} functions")
+        merged = {}
+        # look up and map each scoring function to its provider impl
+        for fn_id, fn_params in requested.items():
+            provider = self.routing_table.get_provider_impl(fn_id)
+            response = await provider.score(input_rows=input_rows, scoring_functions={fn_id: fn_params})
+            merged.update(response.results)
+        return ScoreResponse(results=merged)
+
+
+class EvalRouter(Eval):
+    """Forwards Eval calls to the provider the routing table returns for a benchmark id."""
+
+    def __init__(self, routing_table: RoutingTable) -> None:
+        """Keep the routing table that every later call consults."""
+        logger.debug("Initializing EvalRouter")
+        self.routing_table = routing_table
+
+    async def initialize(self) -> None:
+        """Log the call; the router itself has nothing to set up."""
+        logger.debug("EvalRouter.initialize")
+
+    async def shutdown(self) -> None:
+        """Log the call; the router itself has nothing to tear down."""
+        logger.debug("EvalRouter.shutdown")
+
+    async def run_eval(self, benchmark_id: str, benchmark_config: BenchmarkConfig) -> Job:
+        """Run ``benchmark_id`` on its resolved provider and return what that provider returns.
+
+        Both arguments are forwarded by keyword, unchanged.
+        """
+        logger.debug(f"EvalRouter.run_eval: {benchmark_id}")
+        provider = self.routing_table.get_provider_impl(benchmark_id)
+        return await provider.run_eval(benchmark_id=benchmark_id, benchmark_config=benchmark_config)
+
+    async def evaluate_rows(
+        self,
+        benchmark_id: str,
+        input_rows: list[dict[str, Any]],
+        scoring_functions: list[str],
+        benchmark_config: BenchmarkConfig,
+    ) -> EvaluateResponse:
+        """Evaluate ``input_rows`` on the provider resolved from ``benchmark_id``.
+
+        All four arguments are forwarded by keyword, unchanged.
+        """
+        logger.debug(f"EvalRouter.evaluate_rows: {benchmark_id}, {len(input_rows)} rows")
+        provider = self.routing_table.get_provider_impl(benchmark_id)
+        return await provider.evaluate_rows(
+            benchmark_id=benchmark_id,
+            input_rows=input_rows,
+            scoring_functions=scoring_functions,
+            benchmark_config=benchmark_config,
+        )
+
+    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
+        """Return what the resolved provider reports as the status of ``job_id``.
+
+        ``benchmark_id`` selects the provider; both ids are passed positionally.
+        """
+        logger.debug(f"EvalRouter.job_status: {benchmark_id}, {job_id}")
+        provider = self.routing_table.get_provider_impl(benchmark_id)
+        return await provider.job_status(benchmark_id, job_id)
+
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
+        """Ask the provider resolved from ``benchmark_id`` to cancel ``job_id``.
+
+        The provider's return value is awaited and discarded.
+        """
+        logger.debug(f"EvalRouter.job_cancel: {benchmark_id}, {job_id}")
+        provider = self.routing_table.get_provider_impl(benchmark_id)
+        await provider.job_cancel(benchmark_id, job_id)
+
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+        """Return what the resolved provider reports as the result of ``job_id``.
+
+        ``benchmark_id`` selects the provider; both ids are passed positionally.
+        """
+        logger.debug(f"EvalRouter.job_result: {benchmark_id}, {job_id}")
+        provider = self.routing_table.get_provider_impl(benchmark_id)
+        return await provider.job_result(benchmark_id, job_id)
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/inference.py
similarity index 52%
rename from llama_stack/distribution/routers/routers.py
rename to llama_stack/distribution/routers/inference.py
index d88df00bd..763bd9105 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/inference.py
@@ -6,22 +6,17 @@
 
 import asyncio
 import time
-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Annotated, Any
 
 from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam
 from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
 from pydantic import Field, TypeAdapter
-from typing_extensions import Annotated
 
 from llama_stack.apis.common.content_types import (
-    URL,
     InterleavedContent,
     InterleavedContentItem,
 )
-from llama_stack.apis.common.responses import PaginatedResponse
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import DatasetPurpose, DataSource
-from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
 from llama_stack.apis.inference import (
     BatchChatCompletionResponse,
     BatchCompletionResponse,
@@ -32,8 +27,11 @@ from llama_stack.apis.inference import (
     EmbeddingsResponse,
     EmbeddingTaskType,
     Inference,
+    ListOpenAIChatCompletionResponse,
     LogProbConfig,
     Message,
+    OpenAICompletionWithInputMessages,
+    Order,
     ResponseFormat,
     SamplingParams,
     StopReason,
@@ -47,104 +45,36 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
     OpenAICompletion,
+    OpenAIEmbeddingsResponse,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
 )
 from llama_stack.apis.models import Model, ModelType
-from llama_stack.apis.safety import RunShieldResponse, Safety
-from llama_stack.apis.scoring import (
-    ScoreBatchResponse,
-    ScoreResponse,
-    Scoring,
-    ScoringFnParams,
-)
-from llama_stack.apis.shields import Shield
 from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
-from llama_stack.apis.tools import (
-    ListToolDefsResponse,
-    RAGDocument,
-    RAGQueryConfig,
-    RAGQueryResult,
-    RAGToolRuntime,
-    ToolRuntime,
-)
-from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
 from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
+from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack.providers.utils.inference.stream_utils import stream_and_store_openai_completion
 from llama_stack.providers.utils.telemetry.tracing import get_current_span
 
 logger = get_logger(name=__name__, category="core")
 
 
-class VectorIORouter(VectorIO):
-    """Routes to an provider based on the vector db identifier"""
-
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing VectorIORouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("VectorIORouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("VectorIORouter.shutdown")
-        pass
-
-    async def register_vector_db(
-        self,
-        vector_db_id: str,
-        embedding_model: str,
-        embedding_dimension: Optional[int] = 384,
-        provider_id: Optional[str] = None,
-        provider_vector_db_id: Optional[str] = None,
-    ) -> None:
-        logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}")
-        await self.routing_table.register_vector_db(
-            vector_db_id,
-            embedding_model,
-            embedding_dimension,
-            provider_id,
-            provider_vector_db_id,
-        )
-
-    async def insert_chunks(
-        self,
-        vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
-    ) -> None:
-        logger.debug(
-            f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, ttl_seconds={ttl_seconds}, chunk_ids={[chunk.metadata['document_id'] for chunk in chunks[:3]]}{' and more...' if len(chunks) > 3 else ''}",
-        )
-        return await self.routing_table.get_provider_impl(vector_db_id).insert_chunks(vector_db_id, chunks, ttl_seconds)
-
-    async def query_chunks(
-        self,
-        vector_db_id: str,
-        query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
-    ) -> QueryChunksResponse:
-        logger.debug(f"VectorIORouter.query_chunks: {vector_db_id}")
-        return await self.routing_table.get_provider_impl(vector_db_id).query_chunks(vector_db_id, query, params)
-
-
 class InferenceRouter(Inference):
     """Routes to an provider based on the model"""
 
     def __init__(
         self,
         routing_table: RoutingTable,
-        telemetry: Optional[Telemetry] = None,
+        telemetry: Telemetry | None = None,
+        store: InferenceStore | None = None,
     ) -> None:
         logger.debug("Initializing InferenceRouter")
         self.routing_table = routing_table
         self.telemetry = telemetry
+        self.store = store
         if self.telemetry:
             self.tokenizer = Tokenizer.get_instance()
             self.formatter = ChatFormat(self.tokenizer)
@@ -160,10 +90,10 @@ class InferenceRouter(Inference):
     async def register_model(
         self,
         model_id: str,
-        provider_model_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        model_type: Optional[ModelType] = None,
+        provider_model_id: str | None = None,
+        provider_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        model_type: ModelType | None = None,
     ) -> None:
         logger.debug(
             f"InferenceRouter.register_model: {model_id=} {provider_model_id=} {provider_id=} {metadata=} {model_type=}",
@@ -176,7 +106,7 @@ class InferenceRouter(Inference):
         completion_tokens: int,
         total_tokens: int,
         model: Model,
-    ) -> List[MetricEvent]:
+    ) -> list[MetricEvent]:
         """Constructs a list of MetricEvent objects containing token usage metrics.
 
         Args:
@@ -221,7 +151,7 @@ class InferenceRouter(Inference):
         completion_tokens: int,
         total_tokens: int,
         model: Model,
-    ) -> List[MetricInResponse]:
+    ) -> list[MetricInResponse]:
         metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
         if self.telemetry:
             for metric in metrics:
@@ -230,9 +160,9 @@ class InferenceRouter(Inference):
 
     async def _count_tokens(
         self,
-        messages: List[Message] | InterleavedContent,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-    ) -> Optional[int]:
+        messages: list[Message] | InterleavedContent,
+        tool_prompt_format: ToolPromptFormat | None = None,
+    ) -> int | None:
         if isinstance(messages, list):
             encoded = self.formatter.encode_dialog_prompt(messages, tool_prompt_format)
         else:
@@ -242,16 +172,16 @@ class InferenceRouter(Inference):
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = None,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = None,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
+    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
         logger.debug(
             f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}",
         )
@@ -351,12 +281,12 @@ class InferenceRouter(Inference):
     async def batch_chat_completion(
         self,
         model_id: str,
-        messages_batch: List[List[Message]],
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_config: Optional[ToolConfig] = None,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages_batch: list[list[Message]],
+        tools: list[ToolDefinition] | None = None,
+        tool_config: ToolConfig | None = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ) -> BatchChatCompletionResponse:
         logger.debug(
             f"InferenceRouter.batch_chat_completion: {model_id=}, {len(messages_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
@@ -376,10 +306,10 @@ class InferenceRouter(Inference):
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -439,10 +369,10 @@ class InferenceRouter(Inference):
     async def batch_completion(
         self,
         model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ) -> BatchCompletionResponse:
         logger.debug(
             f"InferenceRouter.batch_completion: {model_id=}, {len(content_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
@@ -453,10 +383,10 @@ class InferenceRouter(Inference):
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         logger.debug(f"InferenceRouter.embeddings: {model_id}")
         model = await self.routing_table.get_model(model_id)
@@ -475,24 +405,24 @@ class InferenceRouter(Inference):
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         logger.debug(
             f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
@@ -531,29 +461,29 @@ class InferenceRouter(Inference):
     async def openai_chat_completion(
         self,
         model: str,
-        messages: Annotated[List[OpenAIMessageParam], Field(..., min_length=1)],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: Annotated[list[OpenAIMessageParam], Field(..., min_length=1)],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         logger.debug(
             f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
         )
@@ -573,6 +503,12 @@ class InferenceRouter(Inference):
             for tool in tools:
                 TypeAdapter(OpenAIChatCompletionToolParam).validate_python(tool)
 
+        # Some providers make tool calls even when tool_choice is "none"
+        # so just clear them both out to avoid unexpected tool calls
+        if tool_choice == "none" and tools is not None:
+            tool_choice = None
+            tools = None
+
         params = dict(
             model=model_obj.identifier,
             messages=messages,
@@ -600,9 +536,71 @@ class InferenceRouter(Inference):
         )
 
         provider = self.routing_table.get_provider_impl(model_obj.identifier)
-        return await provider.openai_chat_completion(**params)
+        if stream:
+            response_stream = await provider.openai_chat_completion(**params)
+            if self.store:
+                return stream_and_store_openai_completion(response_stream, model, self.store, messages)
+            return response_stream
+        else:
+            response = await self._nonstream_openai_chat_completion(provider, params)
+            if self.store:
+                await self.store.store_chat_completion(response, messages)
+            return response
 
-    async def health(self) -> Dict[str, HealthResponse]:
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        """Route an OpenAI-style embeddings request to the provider resolved for ``model``.
+
+        Raises ValueError when the routing table has no such model, or when the
+        model's type is not ``ModelType.embedding``.
+        """
+        logger.debug(
+            f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}",
+        )
+        entry = await self.routing_table.get_model(model)
+        if entry is None:
+            raise ValueError(f"Model '{model}' not found")
+        if entry.model_type != ModelType.embedding:
+            raise ValueError(f"Model '{model}' is not an embedding model")
+
+        identifier = entry.identifier
+        provider = self.routing_table.get_provider_impl(identifier)
+        return await provider.openai_embeddings(
+            model=identifier, input=input, encoding_format=encoding_format, dimensions=dimensions, user=user
+        )
+
+    async def list_chat_completions(
+        self,
+        after: str | None = None,
+        limit: int | None = 20,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIChatCompletionResponse:
+        if not self.store:  # listing is served only from the inference store
+            raise NotImplementedError("List chat completions is not supported: inference store is not configured.")
+        return await self.store.list_chat_completions(after, limit, model, order)
+
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        if not self.store:  # lookups are served only from the inference store
+            raise NotImplementedError("Get chat completion is not supported: inference store is not configured.")
+        return await self.store.get_chat_completion(completion_id)
+
+    async def _nonstream_openai_chat_completion(self, provider: Inference, params: dict) -> OpenAIChatCompletion:
+        """Call the provider and turn empty ``tool_calls`` lists into None (what the OpenAI API returns)."""
+        completion = await provider.openai_chat_completion(**params)
+        for choice in completion.choices:
+            message = choice.message
+            if message and message.tool_calls is not None and len(message.tool_calls) == 0:
+                message.tool_calls = None
+        return completion
+
+    async def health(self) -> dict[str, HealthResponse]:
         health_statuses = {}
         timeout = 0.5
         for provider_id, impl in self.routing_table.impls_by_provider_id.items():
@@ -612,7 +610,7 @@ class InferenceRouter(Inference):
                     continue
                 health = await asyncio.wait_for(impl.health(), timeout=timeout)
                 health_statuses[provider_id] = health
-            except asyncio.TimeoutError:
+            except (asyncio.TimeoutError, TimeoutError):
                 health_statuses[provider_id] = HealthResponse(
                     status=HealthStatus.ERROR,
                     message=f"Health check timed out after {timeout} seconds",
@@ -624,295 +622,3 @@ class InferenceRouter(Inference):
                     status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}"
                 )
         return health_statuses
-
-
-class SafetyRouter(Safety):
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing SafetyRouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("SafetyRouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("SafetyRouter.shutdown")
-        pass
-
-    async def register_shield(
-        self,
-        shield_id: str,
-        provider_shield_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        params: Optional[Dict[str, Any]] = None,
-    ) -> Shield:
-        logger.debug(f"SafetyRouter.register_shield: {shield_id}")
-        return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params)
-
-    async def run_shield(
-        self,
-        shield_id: str,
-        messages: List[Message],
-        params: Dict[str, Any] = None,
-    ) -> RunShieldResponse:
-        logger.debug(f"SafetyRouter.run_shield: {shield_id}")
-        return await self.routing_table.get_provider_impl(shield_id).run_shield(
-            shield_id=shield_id,
-            messages=messages,
-            params=params,
-        )
-
-
-class DatasetIORouter(DatasetIO):
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing DatasetIORouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("DatasetIORouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("DatasetIORouter.shutdown")
-        pass
-
-    async def register_dataset(
-        self,
-        purpose: DatasetPurpose,
-        source: DataSource,
-        metadata: Optional[Dict[str, Any]] = None,
-        dataset_id: Optional[str] = None,
-    ) -> None:
-        logger.debug(
-            f"DatasetIORouter.register_dataset: {purpose=} {source=} {metadata=} {dataset_id=}",
-        )
-        await self.routing_table.register_dataset(
-            purpose=purpose,
-            source=source,
-            metadata=metadata,
-            dataset_id=dataset_id,
-        )
-
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: Optional[int] = None,
-        limit: Optional[int] = None,
-    ) -> PaginatedResponse:
-        logger.debug(
-            f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}",
-        )
-        return await self.routing_table.get_provider_impl(dataset_id).iterrows(
-            dataset_id=dataset_id,
-            start_index=start_index,
-            limit=limit,
-        )
-
-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
-        logger.debug(f"DatasetIORouter.append_rows: {dataset_id}, {len(rows)} rows")
-        return await self.routing_table.get_provider_impl(dataset_id).append_rows(
-            dataset_id=dataset_id,
-            rows=rows,
-        )
-
-
-class ScoringRouter(Scoring):
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing ScoringRouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("ScoringRouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("ScoringRouter.shutdown")
-        pass
-
-    async def score_batch(
-        self,
-        dataset_id: str,
-        scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
-        save_results_dataset: bool = False,
-    ) -> ScoreBatchResponse:
-        logger.debug(f"ScoringRouter.score_batch: {dataset_id}")
-        res = {}
-        for fn_identifier in scoring_functions.keys():
-            score_response = await self.routing_table.get_provider_impl(fn_identifier).score_batch(
-                dataset_id=dataset_id,
-                scoring_functions={fn_identifier: scoring_functions[fn_identifier]},
-            )
-            res.update(score_response.results)
-
-        if save_results_dataset:
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res,
-        )
-
-    async def score(
-        self,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
-    ) -> ScoreResponse:
-        logger.debug(f"ScoringRouter.score: {len(input_rows)} rows, {len(scoring_functions)} functions")
-        res = {}
-        # look up and map each scoring function to its provider impl
-        for fn_identifier in scoring_functions.keys():
-            score_response = await self.routing_table.get_provider_impl(fn_identifier).score(
-                input_rows=input_rows,
-                scoring_functions={fn_identifier: scoring_functions[fn_identifier]},
-            )
-            res.update(score_response.results)
-
-        return ScoreResponse(results=res)
-
-
-class EvalRouter(Eval):
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing EvalRouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("EvalRouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("EvalRouter.shutdown")
-        pass
-
-    async def run_eval(
-        self,
-        benchmark_id: str,
-        benchmark_config: BenchmarkConfig,
-    ) -> Job:
-        logger.debug(f"EvalRouter.run_eval: {benchmark_id}")
-        return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
-            benchmark_id=benchmark_id,
-            benchmark_config=benchmark_config,
-        )
-
-    async def evaluate_rows(
-        self,
-        benchmark_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        benchmark_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        logger.debug(f"EvalRouter.evaluate_rows: {benchmark_id}, {len(input_rows)} rows")
-        return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
-            benchmark_id=benchmark_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            benchmark_config=benchmark_config,
-        )
-
-    async def job_status(
-        self,
-        benchmark_id: str,
-        job_id: str,
-    ) -> Job:
-        logger.debug(f"EvalRouter.job_status: {benchmark_id}, {job_id}")
-        return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
-
-    async def job_cancel(
-        self,
-        benchmark_id: str,
-        job_id: str,
-    ) -> None:
-        logger.debug(f"EvalRouter.job_cancel: {benchmark_id}, {job_id}")
-        await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
-            benchmark_id,
-            job_id,
-        )
-
-    async def job_result(
-        self,
-        benchmark_id: str,
-        job_id: str,
-    ) -> EvaluateResponse:
-        logger.debug(f"EvalRouter.job_result: {benchmark_id}, {job_id}")
-        return await self.routing_table.get_provider_impl(benchmark_id).job_result(
-            benchmark_id,
-            job_id,
-        )
-
-
-class ToolRuntimeRouter(ToolRuntime):
-    class RagToolImpl(RAGToolRuntime):
-        def __init__(
-            self,
-            routing_table: RoutingTable,
-        ) -> None:
-            logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
-            self.routing_table = routing_table
-
-        async def query(
-            self,
-            content: InterleavedContent,
-            vector_db_ids: List[str],
-            query_config: Optional[RAGQueryConfig] = None,
-        ) -> RAGQueryResult:
-            logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_db_ids}")
-            return await self.routing_table.get_provider_impl("knowledge_search").query(
-                content, vector_db_ids, query_config
-            )
-
-        async def insert(
-            self,
-            documents: List[RAGDocument],
-            vector_db_id: str,
-            chunk_size_in_tokens: int = 512,
-        ) -> None:
-            logger.debug(
-                f"ToolRuntimeRouter.RagToolImpl.insert: {vector_db_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
-            )
-            return await self.routing_table.get_provider_impl("insert_into_memory").insert(
-                documents, vector_db_id, chunk_size_in_tokens
-            )
-
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing ToolRuntimeRouter")
-        self.routing_table = routing_table
-
-        # HACK ALERT this should be in sync with "get_all_api_endpoints()"
-        self.rag_tool = self.RagToolImpl(routing_table)
-        for method in ("query", "insert"):
-            setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
-
-    async def initialize(self) -> None:
-        logger.debug("ToolRuntimeRouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("ToolRuntimeRouter.shutdown")
-        pass
-
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> Any:
-        logger.debug(f"ToolRuntimeRouter.invoke_tool: {tool_name}")
-        return await self.routing_table.get_provider_impl(tool_name).invoke_tool(
-            tool_name=tool_name,
-            kwargs=kwargs,
-        )
-
-    async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> ListToolDefsResponse:
-        logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
-        return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint)
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
deleted file mode 100644
index 18b0c891f..000000000
--- a/llama_stack/distribution/routers/routing_tables.py
+++ /dev/null
@@ -1,631 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import logging
-import time
-import uuid
-from typing import Any, Dict, List, Optional
-
-from pydantic import TypeAdapter
-
-from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.type_system import ParamType
-from llama_stack.apis.datasets import (
-    Dataset,
-    DatasetPurpose,
-    Datasets,
-    DatasetType,
-    DataSource,
-    ListDatasetsResponse,
-    RowsDataSource,
-    URIDataSource,
-)
-from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
-from llama_stack.apis.resource import ResourceType
-from llama_stack.apis.scoring_functions import (
-    ListScoringFunctionsResponse,
-    ScoringFn,
-    ScoringFnParams,
-    ScoringFunctions,
-)
-from llama_stack.apis.shields import ListShieldsResponse, Shield, Shields
-from llama_stack.apis.tools import (
-    ListToolGroupsResponse,
-    ListToolsResponse,
-    Tool,
-    ToolGroup,
-    ToolGroups,
-    ToolHost,
-)
-from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs
-from llama_stack.distribution.access_control import check_access
-from llama_stack.distribution.datatypes import (
-    AccessAttributes,
-    BenchmarkWithACL,
-    DatasetWithACL,
-    ModelWithACL,
-    RoutableObject,
-    RoutableObjectWithProvider,
-    RoutedProtocol,
-    ScoringFnWithACL,
-    ShieldWithACL,
-    ToolGroupWithACL,
-    ToolWithACL,
-    VectorDBWithACL,
-)
-from llama_stack.distribution.request_headers import get_auth_attributes
-from llama_stack.distribution.store import DistributionRegistry
-from llama_stack.providers.datatypes import Api, RoutingTable
-
-logger = logging.getLogger(__name__)
-
-
-def get_impl_api(p: Any) -> Api:
-    return p.__provider_spec__.api
-
-
-# TODO: this should return the registered object for all APIs
-async def register_object_with_provider(obj: RoutableObject, p: Any) -> RoutableObject:
-    api = get_impl_api(p)
-
-    assert obj.provider_id != "remote", "Remote provider should not be registered"
-
-    if api == Api.inference:
-        return await p.register_model(obj)
-    elif api == Api.safety:
-        return await p.register_shield(obj)
-    elif api == Api.vector_io:
-        return await p.register_vector_db(obj)
-    elif api == Api.datasetio:
-        return await p.register_dataset(obj)
-    elif api == Api.scoring:
-        return await p.register_scoring_function(obj)
-    elif api == Api.eval:
-        return await p.register_benchmark(obj)
-    elif api == Api.tool_runtime:
-        return await p.register_tool(obj)
-    else:
-        raise ValueError(f"Unknown API {api} for registering object with provider")
-
-
-async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
-    api = get_impl_api(p)
-    if api == Api.vector_io:
-        return await p.unregister_vector_db(obj.identifier)
-    elif api == Api.inference:
-        return await p.unregister_model(obj.identifier)
-    elif api == Api.datasetio:
-        return await p.unregister_dataset(obj.identifier)
-    elif api == Api.tool_runtime:
-        return await p.unregister_tool(obj.identifier)
-    else:
-        raise ValueError(f"Unregister not supported for {api}")
-
-
-Registry = Dict[str, List[RoutableObjectWithProvider]]
-
-
-class CommonRoutingTableImpl(RoutingTable):
-    def __init__(
-        self,
-        impls_by_provider_id: Dict[str, RoutedProtocol],
-        dist_registry: DistributionRegistry,
-    ) -> None:
-        self.impls_by_provider_id = impls_by_provider_id
-        self.dist_registry = dist_registry
-
-    async def initialize(self) -> None:
-        async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, cls) -> None:
-            for obj in objs:
-                if cls is None:
-                    obj.provider_id = provider_id
-                else:
-                    # Create a copy of the model data and explicitly set provider_id
-                    model_data = obj.model_dump()
-                    model_data["provider_id"] = provider_id
-                    obj = cls(**model_data)
-                await self.dist_registry.register(obj)
-
-        # Register all objects from providers
-        for pid, p in self.impls_by_provider_id.items():
-            api = get_impl_api(p)
-            if api == Api.inference:
-                p.model_store = self
-            elif api == Api.safety:
-                p.shield_store = self
-            elif api == Api.vector_io:
-                p.vector_db_store = self
-            elif api == Api.datasetio:
-                p.dataset_store = self
-            elif api == Api.scoring:
-                p.scoring_function_store = self
-                scoring_functions = await p.list_scoring_functions()
-                await add_objects(scoring_functions, pid, ScoringFn)
-            elif api == Api.eval:
-                p.benchmark_store = self
-            elif api == Api.tool_runtime:
-                p.tool_store = self
-
-    async def shutdown(self) -> None:
-        for p in self.impls_by_provider_id.values():
-            await p.shutdown()
-
-    def get_provider_impl(self, routing_key: str, provider_id: Optional[str] = None) -> Any:
-        def apiname_object():
-            if isinstance(self, ModelsRoutingTable):
-                return ("Inference", "model")
-            elif isinstance(self, ShieldsRoutingTable):
-                return ("Safety", "shield")
-            elif isinstance(self, VectorDBsRoutingTable):
-                return ("VectorIO", "vector_db")
-            elif isinstance(self, DatasetsRoutingTable):
-                return ("DatasetIO", "dataset")
-            elif isinstance(self, ScoringFunctionsRoutingTable):
-                return ("Scoring", "scoring_function")
-            elif isinstance(self, BenchmarksRoutingTable):
-                return ("Eval", "benchmark")
-            elif isinstance(self, ToolGroupsRoutingTable):
-                return ("Tools", "tool")
-            else:
-                raise ValueError("Unknown routing table type")
-
-        apiname, objtype = apiname_object()
-
-        # Get objects from disk registry
-        obj = self.dist_registry.get_cached(objtype, routing_key)
-        if not obj:
-            provider_ids = list(self.impls_by_provider_id.keys())
-            if len(provider_ids) > 1:
-                provider_ids_str = f"any of the providers: {', '.join(provider_ids)}"
-            else:
-                provider_ids_str = f"provider: `{provider_ids[0]}`"
-            raise ValueError(
-                f"{objtype.capitalize()} `{routing_key}` not served by {provider_ids_str}. Make sure there is an {apiname} provider serving this {objtype}."
-            )
-
-        if not provider_id or provider_id == obj.provider_id:
-            return self.impls_by_provider_id[obj.provider_id]
-
-        raise ValueError(f"Provider not found for `{routing_key}`")
-
-    async def get_object_by_identifier(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
-        # Get from disk registry
-        obj = await self.dist_registry.get(type, identifier)
-        if not obj:
-            return None
-
-        # Check if user has permission to access this object
-        if not check_access(obj.identifier, getattr(obj, "access_attributes", None), get_auth_attributes()):
-            logger.debug(f"Access denied to {type} '{identifier}' based on attribute mismatch")
-            return None
-
-        return obj
-
-    async def unregister_object(self, obj: RoutableObjectWithProvider) -> None:
-        await self.dist_registry.delete(obj.type, obj.identifier)
-        await unregister_object_from_provider(obj, self.impls_by_provider_id[obj.provider_id])
-
-    async def register_object(self, obj: RoutableObjectWithProvider) -> RoutableObjectWithProvider:
-        # if provider_id is not specified, pick an arbitrary one from existing entries
-        if not obj.provider_id and len(self.impls_by_provider_id) > 0:
-            obj.provider_id = list(self.impls_by_provider_id.keys())[0]
-
-        if obj.provider_id not in self.impls_by_provider_id:
-            raise ValueError(f"Provider `{obj.provider_id}` not found")
-
-        p = self.impls_by_provider_id[obj.provider_id]
-
-        # If object supports access control but no attributes set, use creator's attributes
-        if not obj.access_attributes:
-            creator_attributes = get_auth_attributes()
-            if creator_attributes:
-                obj.access_attributes = AccessAttributes(**creator_attributes)
-                logger.info(f"Setting access attributes for {obj.type} '{obj.identifier}' based on creator's identity")
-
-        registered_obj = await register_object_with_provider(obj, p)
-        # TODO: This needs to be fixed for all APIs once they return the registered object
-        if obj.type == ResourceType.model.value:
-            await self.dist_registry.register(registered_obj)
-            return registered_obj
-
-        else:
-            await self.dist_registry.register(obj)
-            return obj
-
-    async def get_all_with_type(self, type: str) -> List[RoutableObjectWithProvider]:
-        objs = await self.dist_registry.get_all()
-        filtered_objs = [obj for obj in objs if obj.type == type]
-
-        # Apply attribute-based access control filtering
-        if filtered_objs:
-            filtered_objs = [
-                obj
-                for obj in filtered_objs
-                if check_access(obj.identifier, getattr(obj, "access_attributes", None), get_auth_attributes())
-            ]
-
-        return filtered_objs
-
-
-class ModelsRoutingTable(CommonRoutingTableImpl, Models):
-    async def list_models(self) -> ListModelsResponse:
-        return ListModelsResponse(data=await self.get_all_with_type("model"))
-
-    async def openai_list_models(self) -> OpenAIListModelsResponse:
-        models = await self.get_all_with_type("model")
-        openai_models = [
-            OpenAIModel(
-                id=model.identifier,
-                object="model",
-                created=int(time.time()),
-                owned_by="llama_stack",
-            )
-            for model in models
-        ]
-        return OpenAIListModelsResponse(data=openai_models)
-
-    async def get_model(self, model_id: str) -> Model:
-        model = await self.get_object_by_identifier("model", model_id)
-        if model is None:
-            raise ValueError(f"Model '{model_id}' not found")
-        return model
-
-    async def register_model(
-        self,
-        model_id: str,
-        provider_model_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        model_type: Optional[ModelType] = None,
-    ) -> Model:
-        if provider_model_id is None:
-            provider_model_id = model_id
-        if provider_id is None:
-            # If provider_id not specified, use the only provider if it supports this model
-            if len(self.impls_by_provider_id) == 1:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-            else:
-                raise ValueError(
-                    f"No provider specified and multiple providers available. Please specify a provider_id. Available providers: {self.impls_by_provider_id.keys()}"
-                )
-        if metadata is None:
-            metadata = {}
-        if model_type is None:
-            model_type = ModelType.llm
-        if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
-            raise ValueError("Embedding model must have an embedding dimension in its metadata")
-        model = ModelWithACL(
-            identifier=model_id,
-            provider_resource_id=provider_model_id,
-            provider_id=provider_id,
-            metadata=metadata,
-            model_type=model_type,
-        )
-        registered_model = await self.register_object(model)
-        return registered_model
-
-    async def unregister_model(self, model_id: str) -> None:
-        existing_model = await self.get_model(model_id)
-        if existing_model is None:
-            raise ValueError(f"Model {model_id} not found")
-        await self.unregister_object(existing_model)
-
-
-class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
-    async def list_shields(self) -> ListShieldsResponse:
-        return ListShieldsResponse(data=await self.get_all_with_type(ResourceType.shield.value))
-
-    async def get_shield(self, identifier: str) -> Shield:
-        shield = await self.get_object_by_identifier("shield", identifier)
-        if shield is None:
-            raise ValueError(f"Shield '{identifier}' not found")
-        return shield
-
-    async def register_shield(
-        self,
-        shield_id: str,
-        provider_shield_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        params: Optional[Dict[str, Any]] = None,
-    ) -> Shield:
-        if provider_shield_id is None:
-            provider_shield_id = shield_id
-        if provider_id is None:
-            # If provider_id not specified, use the only provider if it supports this shield type
-            if len(self.impls_by_provider_id) == 1:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-            else:
-                raise ValueError(
-                    "No provider specified and multiple providers available. Please specify a provider_id."
-                )
-        if params is None:
-            params = {}
-        shield = ShieldWithACL(
-            identifier=shield_id,
-            provider_resource_id=provider_shield_id,
-            provider_id=provider_id,
-            params=params,
-        )
-        await self.register_object(shield)
-        return shield
-
-
-class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
-    async def list_vector_dbs(self) -> ListVectorDBsResponse:
-        return ListVectorDBsResponse(data=await self.get_all_with_type("vector_db"))
-
-    async def get_vector_db(self, vector_db_id: str) -> VectorDB:
-        vector_db = await self.get_object_by_identifier("vector_db", vector_db_id)
-        if vector_db is None:
-            raise ValueError(f"Vector DB '{vector_db_id}' not found")
-        return vector_db
-
-    async def register_vector_db(
-        self,
-        vector_db_id: str,
-        embedding_model: str,
-        embedding_dimension: Optional[int] = 384,
-        provider_id: Optional[str] = None,
-        provider_vector_db_id: Optional[str] = None,
-    ) -> VectorDB:
-        if provider_vector_db_id is None:
-            provider_vector_db_id = vector_db_id
-        if provider_id is None:
-            if len(self.impls_by_provider_id) > 0:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-                if len(self.impls_by_provider_id) > 1:
-                    logger.warning(
-                        f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
-                    )
-            else:
-                raise ValueError("No provider available. Please configure a vector_io provider.")
-        model = await self.get_object_by_identifier("model", embedding_model)
-        if model is None:
-            raise ValueError(f"Model {embedding_model} not found")
-        if model.model_type != ModelType.embedding:
-            raise ValueError(f"Model {embedding_model} is not an embedding model")
-        if "embedding_dimension" not in model.metadata:
-            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
-        vector_db_data = {
-            "identifier": vector_db_id,
-            "type": ResourceType.vector_db.value,
-            "provider_id": provider_id,
-            "provider_resource_id": provider_vector_db_id,
-            "embedding_model": embedding_model,
-            "embedding_dimension": model.metadata["embedding_dimension"],
-        }
-        vector_db = TypeAdapter(VectorDBWithACL).validate_python(vector_db_data)
-        await self.register_object(vector_db)
-        return vector_db
-
-    async def unregister_vector_db(self, vector_db_id: str) -> None:
-        existing_vector_db = await self.get_vector_db(vector_db_id)
-        if existing_vector_db is None:
-            raise ValueError(f"Vector DB {vector_db_id} not found")
-        await self.unregister_object(existing_vector_db)
-
-
-class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
-    async def list_datasets(self) -> ListDatasetsResponse:
-        return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))
-
-    async def get_dataset(self, dataset_id: str) -> Dataset:
-        dataset = await self.get_object_by_identifier("dataset", dataset_id)
-        if dataset is None:
-            raise ValueError(f"Dataset '{dataset_id}' not found")
-        return dataset
-
-    async def register_dataset(
-        self,
-        purpose: DatasetPurpose,
-        source: DataSource,
-        metadata: Optional[Dict[str, Any]] = None,
-        dataset_id: Optional[str] = None,
-    ) -> Dataset:
-        if isinstance(source, dict):
-            if source["type"] == "uri":
-                source = URIDataSource.parse_obj(source)
-            elif source["type"] == "rows":
-                source = RowsDataSource.parse_obj(source)
-
-        if not dataset_id:
-            dataset_id = f"dataset-{str(uuid.uuid4())}"
-
-        provider_dataset_id = dataset_id
-
-        # infer provider from source
-        if source.type == DatasetType.rows.value:
-            provider_id = "localfs"
-        elif source.type == DatasetType.uri.value:
-            # infer provider from uri
-            if source.uri.startswith("huggingface"):
-                provider_id = "huggingface"
-            else:
-                provider_id = "localfs"
-        else:
-            raise ValueError(f"Unknown data source type: {source.type}")
-
-        if metadata is None:
-            metadata = {}
-
-        dataset = DatasetWithACL(
-            identifier=dataset_id,
-            provider_resource_id=provider_dataset_id,
-            provider_id=provider_id,
-            purpose=purpose,
-            source=source,
-            metadata=metadata,
-        )
-
-        await self.register_object(dataset)
-        return dataset
-
-    async def unregister_dataset(self, dataset_id: str) -> None:
-        dataset = await self.get_dataset(dataset_id)
-        if dataset is None:
-            raise ValueError(f"Dataset {dataset_id} not found")
-        await self.unregister_object(dataset)
-
-
-class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
-    async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
-        return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))
-
-    async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn:
-        scoring_fn = await self.get_object_by_identifier("scoring_function", scoring_fn_id)
-        if scoring_fn is None:
-            raise ValueError(f"Scoring function '{scoring_fn_id}' not found")
-        return scoring_fn
-
-    async def register_scoring_function(
-        self,
-        scoring_fn_id: str,
-        description: str,
-        return_type: ParamType,
-        provider_scoring_fn_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        params: Optional[ScoringFnParams] = None,
-    ) -> None:
-        if provider_scoring_fn_id is None:
-            provider_scoring_fn_id = scoring_fn_id
-        if provider_id is None:
-            if len(self.impls_by_provider_id) == 1:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-            else:
-                raise ValueError(
-                    "No provider specified and multiple providers available. Please specify a provider_id."
-                )
-        scoring_fn = ScoringFnWithACL(
-            identifier=scoring_fn_id,
-            description=description,
-            return_type=return_type,
-            provider_resource_id=provider_scoring_fn_id,
-            provider_id=provider_id,
-            params=params,
-        )
-        scoring_fn.provider_id = provider_id
-        await self.register_object(scoring_fn)
-
-
-class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
-    async def list_benchmarks(self) -> ListBenchmarksResponse:
-        return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
-
-    async def get_benchmark(self, benchmark_id: str) -> Benchmark:
-        benchmark = await self.get_object_by_identifier("benchmark", benchmark_id)
-        if benchmark is None:
-            raise ValueError(f"Benchmark '{benchmark_id}' not found")
-        return benchmark
-
-    async def register_benchmark(
-        self,
-        benchmark_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        metadata: Optional[Dict[str, Any]] = None,
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-    ) -> None:
-        if metadata is None:
-            metadata = {}
-        if provider_id is None:
-            if len(self.impls_by_provider_id) == 1:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-            else:
-                raise ValueError(
-                    "No provider specified and multiple providers available. Please specify a provider_id."
-                )
-        if provider_benchmark_id is None:
-            provider_benchmark_id = benchmark_id
-        benchmark = BenchmarkWithACL(
-            identifier=benchmark_id,
-            dataset_id=dataset_id,
-            scoring_functions=scoring_functions,
-            metadata=metadata,
-            provider_id=provider_id,
-            provider_resource_id=provider_benchmark_id,
-        )
-        await self.register_object(benchmark)
-
-
-class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
-    async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsResponse:
-        tools = await self.get_all_with_type("tool")
-        if toolgroup_id:
-            tools = [tool for tool in tools if tool.toolgroup_id == toolgroup_id]
-        return ListToolsResponse(data=tools)
-
-    async def list_tool_groups(self) -> ListToolGroupsResponse:
-        return ListToolGroupsResponse(data=await self.get_all_with_type("tool_group"))
-
-    async def get_tool_group(self, toolgroup_id: str) -> ToolGroup:
-        tool_group = await self.get_object_by_identifier("tool_group", toolgroup_id)
-        if tool_group is None:
-            raise ValueError(f"Tool group '{toolgroup_id}' not found")
-        return tool_group
-
-    async def get_tool(self, tool_name: str) -> Tool:
-        return await self.get_object_by_identifier("tool", tool_name)
-
-    async def register_tool_group(
-        self,
-        toolgroup_id: str,
-        provider_id: str,
-        mcp_endpoint: Optional[URL] = None,
-        args: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        tools = []
-        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
-        tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
-
-        for tool_def in tool_defs.data:
-            tools.append(
-                ToolWithACL(
-                    identifier=tool_def.name,
-                    toolgroup_id=toolgroup_id,
-                    description=tool_def.description or "",
-                    parameters=tool_def.parameters or [],
-                    provider_id=provider_id,
-                    provider_resource_id=tool_def.name,
-                    metadata=tool_def.metadata,
-                    tool_host=tool_host,
-                )
-            )
-        for tool in tools:
-            existing_tool = await self.get_tool(tool.identifier)
-            # Compare existing and new object if one exists
-            if existing_tool:
-                existing_dict = existing_tool.model_dump()
-                new_dict = tool.model_dump()
-
-                if existing_dict != new_dict:
-                    raise ValueError(
-                        f"Object {tool.identifier} already exists in registry. Please use a different identifier."
-                    )
-            await self.register_object(tool)
-
-        await self.dist_registry.register(
-            ToolGroupWithACL(
-                identifier=toolgroup_id,
-                provider_id=provider_id,
-                provider_resource_id=toolgroup_id,
-                mcp_endpoint=mcp_endpoint,
-                args=args,
-            )
-        )
-
-    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
-        tool_group = await self.get_tool_group(toolgroup_id)
-        if tool_group is None:
-            raise ValueError(f"Tool group {toolgroup_id} not found")
-        tools = await self.list_tools(toolgroup_id)
-        for tool in getattr(tools, "data", []):
-            await self.unregister_object(tool)
-        await self.unregister_object(tool_group)
-
-    async def shutdown(self) -> None:
-        pass
diff --git a/llama_stack/distribution/routers/safety.py b/llama_stack/distribution/routers/safety.py
new file mode 100644
index 000000000..9761d2db0
--- /dev/null
+++ b/llama_stack/distribution/routers/safety.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.inference import (
+    Message,
+)
+from llama_stack.apis.safety import RunShieldResponse, Safety
+from llama_stack.apis.shields import Shield
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import RoutingTable
+
+logger = get_logger(name=__name__, category="core")
+
+
+class SafetyRouter(Safety):
+    def __init__(
+        self,
+        routing_table: RoutingTable,
+    ) -> None:
+        logger.debug("Initializing SafetyRouter")
+        self.routing_table = routing_table
+
+    async def initialize(self) -> None:
+        logger.debug("SafetyRouter.initialize")
+        pass
+
+    async def shutdown(self) -> None:
+        logger.debug("SafetyRouter.shutdown")
+        pass
+
+    async def register_shield(
+        self,
+        shield_id: str,
+        provider_shield_id: str | None = None,
+        provider_id: str | None = None,
+        params: dict[str, Any] | None = None,
+    ) -> Shield:
+        logger.debug(f"SafetyRouter.register_shield: {shield_id}")
+        return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params)
+
+    async def run_shield(
+        self,
+        shield_id: str,
+        messages: list[Message],
+        params: dict[str, Any] | None = None,
+    ) -> RunShieldResponse:
+        """Run `shield_id` on `messages` through the provider the routing table returns for it."""
+        logger.debug(f"SafetyRouter.run_shield: {shield_id}")
+        provider = self.routing_table.get_provider_impl(shield_id)
+        # `params` may be None (the default); it is handed to the provider unchanged, so the
+        # provider decides how to treat a missing value.
+        return await provider.run_shield(shield_id=shield_id, messages=messages, params=params)
diff --git a/llama_stack/distribution/routers/tool_runtime.py b/llama_stack/distribution/routers/tool_runtime.py
new file mode 100644
index 000000000..285843dbc
--- /dev/null
+++ b/llama_stack/distribution/routers/tool_runtime.py
@@ -0,0 +1,92 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.common.content_types import (
+    URL,
+    InterleavedContent,
+)
+from llama_stack.apis.tools import (
+    ListToolsResponse,
+    RAGDocument,
+    RAGQueryConfig,
+    RAGQueryResult,
+    RAGToolRuntime,
+    ToolRuntime,
+)
+from llama_stack.log import get_logger
+
+from ..routing_tables.toolgroups import ToolGroupsRoutingTable
+
+logger = get_logger(name=__name__, category="core")
+
+
+class ToolRuntimeRouter(ToolRuntime):
+    class RagToolImpl(RAGToolRuntime):
+        def __init__(
+            self,
+            routing_table: ToolGroupsRoutingTable,
+        ) -> None:
+            logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
+            self.routing_table = routing_table
+
+        async def query(
+            self,
+            content: InterleavedContent,
+            vector_db_ids: list[str],
+            query_config: RAGQueryConfig | None = None,
+        ) -> RAGQueryResult:
+            logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_db_ids}")
+            return await self.routing_table.get_provider_impl("knowledge_search").query(
+                content, vector_db_ids, query_config
+            )
+
+        async def insert(
+            self,
+            documents: list[RAGDocument],
+            vector_db_id: str,
+            chunk_size_in_tokens: int = 512,
+        ) -> None:
+            logger.debug(
+                f"ToolRuntimeRouter.RagToolImpl.insert: {vector_db_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
+            )
+            return await self.routing_table.get_provider_impl("insert_into_memory").insert(
+                documents, vector_db_id, chunk_size_in_tokens
+            )
+
+    def __init__(
+        self,
+        routing_table: ToolGroupsRoutingTable,
+    ) -> None:
+        logger.debug("Initializing ToolRuntimeRouter")
+        self.routing_table = routing_table
+
+        # HACK ALERT this should be in sync with "get_all_api_endpoints()"
+        self.rag_tool = self.RagToolImpl(routing_table)
+        for method in ("query", "insert"):
+            setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
+
+    async def initialize(self) -> None:
+        logger.debug("ToolRuntimeRouter.initialize")
+        pass
+
+    async def shutdown(self) -> None:
+        logger.debug("ToolRuntimeRouter.shutdown")
+        pass
+
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> Any:
+        logger.debug(f"ToolRuntimeRouter.invoke_tool: {tool_name}")
+        return await self.routing_table.get_provider_impl(tool_name).invoke_tool(
+            tool_name=tool_name,
+            kwargs=kwargs,
+        )
+
+    async def list_runtime_tools(
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
+    ) -> ListToolsResponse:
+        logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
+        return await self.routing_table.list_tools(tool_group_id)
diff --git a/llama_stack/distribution/routers/vector_io.py b/llama_stack/distribution/routers/vector_io.py
new file mode 100644
index 000000000..8c17aa890
--- /dev/null
+++ b/llama_stack/distribution/routers/vector_io.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.common.content_types import (
+    InterleavedContent,
+)
+from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import RoutingTable
+
+logger = get_logger(name=__name__, category="core")
+
+
+class VectorIORouter(VectorIO):
+    """Routes to a provider based on the vector db identifier"""
+
+    def __init__(
+        self,
+        routing_table: RoutingTable,
+    ) -> None:
+        logger.debug("Initializing VectorIORouter")
+        self.routing_table = routing_table
+
+    async def initialize(self) -> None:
+        logger.debug("VectorIORouter.initialize")
+        pass
+
+    async def shutdown(self) -> None:
+        logger.debug("VectorIORouter.shutdown")
+        pass
+
+    async def register_vector_db(
+        self,
+        vector_db_id: str,
+        embedding_model: str,
+        embedding_dimension: int | None = 384,
+        provider_id: str | None = None,
+        provider_vector_db_id: str | None = None,
+    ) -> None:
+        logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}")
+        await self.routing_table.register_vector_db(
+            vector_db_id,
+            embedding_model,
+            embedding_dimension,
+            provider_id,
+            provider_vector_db_id,
+        )
+
+    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+        """Insert `chunks` through the provider that the routing table returns for `vector_db_id`.
+
+        The debug log previews the `document_id` metadata of the first three chunks. It is read
+        with `.get()` so a chunk lacking that key is logged as None instead of raising KeyError.
+        """
+        logger.debug(
+            f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, ttl_seconds={ttl_seconds}, chunk_ids={[chunk.metadata.get('document_id') for chunk in chunks[:3]]}{' and more...' if len(chunks) > 3 else ''}",
+        )
+        return await self.routing_table.get_provider_impl(vector_db_id).insert_chunks(vector_db_id, chunks, ttl_seconds)
+
+    async def query_chunks(
+        self,
+        vector_db_id: str,
+        query: InterleavedContent,
+        params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        logger.debug(f"VectorIORouter.query_chunks: {vector_db_id}")
+        return await self.routing_table.get_provider_impl(vector_db_id).query_chunks(vector_db_id, query, params)
diff --git a/llama_stack/providers/tests/__init__.py b/llama_stack/distribution/routing_tables/__init__.py
similarity index 100%
rename from llama_stack/providers/tests/__init__.py
rename to llama_stack/distribution/routing_tables/__init__.py
diff --git a/llama_stack/distribution/routing_tables/benchmarks.py b/llama_stack/distribution/routing_tables/benchmarks.py
new file mode 100644
index 000000000..589a00c02
--- /dev/null
+++ b/llama_stack/distribution/routing_tables/benchmarks.py
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
+from llama_stack.distribution.datatypes import (
+    BenchmarkWithACL,
+)
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl
+
+logger = get_logger(name=__name__, category="core")
+
+
+class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
+    async def list_benchmarks(self) -> ListBenchmarksResponse:
+        return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
+
+    async def get_benchmark(self, benchmark_id: str) -> Benchmark:
+        benchmark = await self.get_object_by_identifier("benchmark", benchmark_id)
+        if benchmark is None:
+            raise ValueError(f"Benchmark '{benchmark_id}' not found")
+        return benchmark
+
+    async def register_benchmark(
+        self,
+        benchmark_id: str,
+        dataset_id: str,
+        scoring_functions: list[str],
+        metadata: dict[str, Any] | None = None,
+        provider_benchmark_id: str | None = None,
+        provider_id: str | None = None,
+    ) -> None:
+        if metadata is None:
+            metadata = {}
+        if provider_id is None:
+            if len(self.impls_by_provider_id) == 1:
+                provider_id = list(self.impls_by_provider_id.keys())[0]
+            else:
+                raise ValueError(
+                    "No provider specified and multiple providers available. Please specify a provider_id."
+                )
+        if provider_benchmark_id is None:
+            provider_benchmark_id = benchmark_id
+        benchmark = BenchmarkWithACL(
+            identifier=benchmark_id,
+            dataset_id=dataset_id,
+            scoring_functions=scoring_functions,
+            metadata=metadata,
+            provider_id=provider_id,
+            provider_resource_id=provider_benchmark_id,
+        )
+        await self.register_object(benchmark)
diff --git a/llama_stack/distribution/routing_tables/common.py b/llama_stack/distribution/routing_tables/common.py
new file mode 100644
index 000000000..8ec87ca50
--- /dev/null
+++ b/llama_stack/distribution/routing_tables/common.py
@@ -0,0 +1,218 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.resource import ResourceType
+from llama_stack.apis.scoring_functions import ScoringFn
+from llama_stack.distribution.access_control import check_access
+from llama_stack.distribution.datatypes import (
+    AccessAttributes,
+    RoutableObject,
+    RoutableObjectWithProvider,
+    RoutedProtocol,
+)
+from llama_stack.distribution.request_headers import get_auth_attributes
+from llama_stack.distribution.store import DistributionRegistry
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import Api, RoutingTable
+
+logger = get_logger(name=__name__, category="core")
+
+
+def get_impl_api(p: Any) -> Api:
+    return p.__provider_spec__.api
+
+
+# TODO: this should return the registered object for all APIs
+async def register_object_with_provider(obj: RoutableObject, p: Any) -> RoutableObject:
+    api = get_impl_api(p)
+
+    assert obj.provider_id != "remote", "Remote provider should not be registered"
+
+    if api == Api.inference:
+        return await p.register_model(obj)
+    elif api == Api.safety:
+        return await p.register_shield(obj)
+    elif api == Api.vector_io:
+        return await p.register_vector_db(obj)
+    elif api == Api.datasetio:
+        return await p.register_dataset(obj)
+    elif api == Api.scoring:
+        return await p.register_scoring_function(obj)
+    elif api == Api.eval:
+        return await p.register_benchmark(obj)
+    elif api == Api.tool_runtime:
+        return await p.register_toolgroup(obj)
+    else:
+        raise ValueError(f"Unknown API {api} for registering object with provider")
+
+
+async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
+    api = get_impl_api(p)
+    if api == Api.vector_io:
+        return await p.unregister_vector_db(obj.identifier)
+    elif api == Api.inference:
+        return await p.unregister_model(obj.identifier)
+    elif api == Api.datasetio:
+        return await p.unregister_dataset(obj.identifier)
+    elif api == Api.tool_runtime:
+        return await p.unregister_toolgroup(obj.identifier)
+    else:
+        raise ValueError(f"Unregister not supported for {api}")
+
+
+Registry = dict[str, list[RoutableObjectWithProvider]]
+
+
+class CommonRoutingTableImpl(RoutingTable):
+    def __init__(
+        self,
+        impls_by_provider_id: dict[str, RoutedProtocol],
+        dist_registry: DistributionRegistry,
+    ) -> None:
+        self.impls_by_provider_id = impls_by_provider_id
+        self.dist_registry = dist_registry
+
+    async def initialize(self) -> None:
+        async def add_objects(objs: list[RoutableObjectWithProvider], provider_id: str, cls) -> None:
+            for obj in objs:
+                if cls is None:
+                    obj.provider_id = provider_id
+                else:
+                    # Create a copy of the model data and explicitly set provider_id
+                    model_data = obj.model_dump()
+                    model_data["provider_id"] = provider_id
+                    obj = cls(**model_data)
+                await self.dist_registry.register(obj)
+
+        # Register all objects from providers
+        for pid, p in self.impls_by_provider_id.items():
+            api = get_impl_api(p)
+            if api == Api.inference:
+                p.model_store = self
+            elif api == Api.safety:
+                p.shield_store = self
+            elif api == Api.vector_io:
+                p.vector_db_store = self
+            elif api == Api.datasetio:
+                p.dataset_store = self
+            elif api == Api.scoring:
+                p.scoring_function_store = self
+                scoring_functions = await p.list_scoring_functions()
+                await add_objects(scoring_functions, pid, ScoringFn)
+            elif api == Api.eval:
+                p.benchmark_store = self
+            elif api == Api.tool_runtime:
+                p.tool_store = self
+
+    async def shutdown(self) -> None:
+        for p in self.impls_by_provider_id.values():
+            await p.shutdown()
+
+    def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
+        from .benchmarks import BenchmarksRoutingTable
+        from .datasets import DatasetsRoutingTable
+        from .models import ModelsRoutingTable
+        from .scoring_functions import ScoringFunctionsRoutingTable
+        from .shields import ShieldsRoutingTable
+        from .toolgroups import ToolGroupsRoutingTable
+        from .vector_dbs import VectorDBsRoutingTable
+
+        def apiname_object():
+            if isinstance(self, ModelsRoutingTable):
+                return ("Inference", "model")
+            elif isinstance(self, ShieldsRoutingTable):
+                return ("Safety", "shield")
+            elif isinstance(self, VectorDBsRoutingTable):
+                return ("VectorIO", "vector_db")
+            elif isinstance(self, DatasetsRoutingTable):
+                return ("DatasetIO", "dataset")
+            elif isinstance(self, ScoringFunctionsRoutingTable):
+                return ("Scoring", "scoring_function")
+            elif isinstance(self, BenchmarksRoutingTable):
+                return ("Eval", "benchmark")
+            elif isinstance(self, ToolGroupsRoutingTable):
+                return ("ToolGroups", "tool_group")
+            else:
+                raise ValueError("Unknown routing table type")
+
+        apiname, objtype = apiname_object()
+
+        # Get objects from disk registry
+        obj = self.dist_registry.get_cached(objtype, routing_key)
+        if not obj:
+            provider_ids = list(self.impls_by_provider_id.keys())
+            if len(provider_ids) > 1:
+                provider_ids_str = f"any of the providers: {', '.join(provider_ids)}"
+            else:
+                provider_ids_str = f"provider: `{provider_ids[0]}`"
+            raise ValueError(
+                f"{objtype.capitalize()} `{routing_key}` not served by {provider_ids_str}. Make sure there is an {apiname} provider serving this {objtype}."
+            )
+
+        if not provider_id or provider_id == obj.provider_id:
+            return self.impls_by_provider_id[obj.provider_id]
+
+        raise ValueError(f"Provider not found for `{routing_key}`")
+
+    async def get_object_by_identifier(self, type: str, identifier: str) -> RoutableObjectWithProvider | None:
+        # Get from disk registry
+        obj = await self.dist_registry.get(type, identifier)
+        if not obj:
+            return None
+
+        # Check if user has permission to access this object
+        if not check_access(obj.identifier, getattr(obj, "access_attributes", None), get_auth_attributes()):
+            logger.debug(f"Access denied to {type} '{identifier}' based on attribute mismatch")
+            return None
+
+        return obj
+
+    async def unregister_object(self, obj: RoutableObjectWithProvider) -> None:
+        await self.dist_registry.delete(obj.type, obj.identifier)
+        await unregister_object_from_provider(obj, self.impls_by_provider_id[obj.provider_id])
+
+    async def register_object(self, obj: RoutableObjectWithProvider) -> RoutableObjectWithProvider:
+        # if provider_id is not specified, pick an arbitrary one from existing entries
+        if not obj.provider_id and len(self.impls_by_provider_id) > 0:
+            obj.provider_id = list(self.impls_by_provider_id.keys())[0]
+
+        if obj.provider_id not in self.impls_by_provider_id:
+            raise ValueError(f"Provider `{obj.provider_id}` not found")
+
+        p = self.impls_by_provider_id[obj.provider_id]
+
+        # If object supports access control but no attributes set, use creator's attributes
+        if not obj.access_attributes:
+            creator_attributes = get_auth_attributes()
+            if creator_attributes:
+                obj.access_attributes = AccessAttributes(**creator_attributes)
+                logger.info(f"Setting access attributes for {obj.type} '{obj.identifier}' based on creator's identity")
+
+        registered_obj = await register_object_with_provider(obj, p)
+        # TODO: This needs to be fixed for all APIs once they return the registered object
+        if obj.type == ResourceType.model.value:
+            await self.dist_registry.register(registered_obj)
+            return registered_obj
+
+        else:
+            await self.dist_registry.register(obj)
+            return obj
+
+    async def get_all_with_type(self, type: str) -> list[RoutableObjectWithProvider]:
+        objs = await self.dist_registry.get_all()
+        filtered_objs = [obj for obj in objs if obj.type == type]
+
+        # Apply attribute-based access control filtering
+        if filtered_objs:
+            filtered_objs = [
+                obj
+                for obj in filtered_objs
+                if check_access(obj.identifier, getattr(obj, "access_attributes", None), get_auth_attributes())
+            ]
+
+        return filtered_objs
diff --git a/llama_stack/distribution/routing_tables/datasets.py b/llama_stack/distribution/routing_tables/datasets.py
new file mode 100644
index 000000000..4401ad47e
--- /dev/null
+++ b/llama_stack/distribution/routing_tables/datasets.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import uuid
+from typing import Any
+
+from llama_stack.apis.datasets import (
+    Dataset,
+    DatasetPurpose,
+    Datasets,
+    DatasetType,
+    DataSource,
+    ListDatasetsResponse,
+    RowsDataSource,
+    URIDataSource,
+)
+from llama_stack.apis.resource import ResourceType
+from llama_stack.distribution.datatypes import (
+    DatasetWithACL,
+)
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl
+
+logger = get_logger(name=__name__, category="core")
+
+
+class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
+    async def list_datasets(self) -> ListDatasetsResponse:
+        return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))
+
+    async def get_dataset(self, dataset_id: str) -> Dataset:
+        dataset = await self.get_object_by_identifier("dataset", dataset_id)
+        if dataset is None:
+            raise ValueError(f"Dataset '{dataset_id}' not found")
+        return dataset
+
+    async def register_dataset(
+        self,
+        purpose: DatasetPurpose,
+        source: DataSource,
+        metadata: dict[str, Any] | None = None,
+        dataset_id: str | None = None,
+    ) -> Dataset:
+        """Build a DatasetWithACL for `source`, register it via `register_object`, and return it.
+
+        Provider selection: `metadata["provider_id"]` when set; otherwise rows sources and
+        non-"huggingface" URIs use "localfs", "huggingface..." URIs use "huggingface", and any
+        other source type raises ValueError. A missing `dataset_id` becomes "dataset-<uuid4>".
+        """
+        if isinstance(source, dict):
+            if source["type"] == "uri":
+                source = URIDataSource.parse_obj(source)
+            elif source["type"] == "rows":
+                source = RowsDataSource.parse_obj(source)
+
+        if not dataset_id:
+            dataset_id = f"dataset-{str(uuid.uuid4())}"
+
+        # An explicit provider_id in metadata wins; metadata without one falls through to inference
+        if metadata and metadata.get("provider_id"):
+            provider_id = metadata.get("provider_id")  # pass through from nvidia datasetio
+        elif source.type == DatasetType.rows.value:
+            provider_id = "localfs"
+        elif source.type == DatasetType.uri.value:
+            # infer provider from uri
+            provider_id = "huggingface" if source.uri.startswith("huggingface") else "localfs"
+        else:
+            raise ValueError(f"Unknown data source type: {source.type}")
+
+        if metadata is None:
+            metadata = {}
+
+        dataset = DatasetWithACL(
+            identifier=dataset_id,
+            provider_resource_id=dataset_id,
+            provider_id=provider_id,
+            purpose=purpose,
+            source=source,
+            metadata=metadata,
+        )
+
+        await self.register_object(dataset)
+        return dataset
+
+    async def unregister_dataset(self, dataset_id: str) -> None:
+        dataset = await self.get_dataset(dataset_id)
+        if dataset is None:
+            raise ValueError(f"Dataset {dataset_id} not found")
+        await self.unregister_object(dataset)
diff --git a/llama_stack/distribution/routing_tables/models.py b/llama_stack/distribution/routing_tables/models.py
new file mode 100644
index 000000000..7216d9935
--- /dev/null
+++ b/llama_stack/distribution/routing_tables/models.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import time
+from typing import Any
+
+from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
+from llama_stack.distribution.datatypes import (
+    ModelWithACL,
+)
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl
+
+logger = get_logger(name=__name__, category="core")
+
+
+class ModelsRoutingTable(CommonRoutingTableImpl, Models):
+    async def list_models(self) -> ListModelsResponse:
+        return ListModelsResponse(data=await self.get_all_with_type("model"))
+
+    async def openai_list_models(self) -> OpenAIListModelsResponse:
+        models = await self.get_all_with_type("model")
+        openai_models = [
+            OpenAIModel(
+                id=model.identifier,
+                object="model",
+                created=int(time.time()),
+                owned_by="llama_stack",
+            )
+            for model in models
+        ]
+        return OpenAIListModelsResponse(data=openai_models)
+
+    async def get_model(self, model_id: str) -> Model:
+        model = await self.get_object_by_identifier("model", model_id)
+        if model is None:
+            raise ValueError(f"Model '{model_id}' not found")
+        return model
+
+    async def register_model(
+        self,
+        model_id: str,
+        provider_model_id: str | None = None,
+        provider_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        model_type: ModelType | None = None,
+    ) -> Model:
+        """Build a ModelWithACL from the arguments and return the result of `register_object`."""
+        if provider_model_id is None:
+            provider_model_id = model_id
+        if provider_id is None:
+            # Without an explicit provider_id, fall back to the only configured provider (if exactly one)
+            if len(self.impls_by_provider_id) == 1:
+                provider_id = list(self.impls_by_provider_id.keys())[0]
+            else:
+                raise ValueError(
+                    f"No provider specified and multiple providers available. Please specify a provider_id. Available providers: {list(self.impls_by_provider_id.keys())}"
+                )
+        if metadata is None:
+            metadata = {}
+        if model_type is None:
+            model_type = ModelType.llm
+        if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
+            raise ValueError("Embedding model must have an embedding dimension in its metadata")
+        model = ModelWithACL(
+            identifier=model_id,
+            provider_resource_id=provider_model_id,
+            provider_id=provider_id,
+            metadata=metadata,
+            model_type=model_type,
+        )
+        return await self.register_object(model)
+
+    async def unregister_model(self, model_id: str) -> None:
+        existing_model = await self.get_model(model_id)
+        if existing_model is None:
+            raise ValueError(f"Model {model_id} not found")
+        await self.unregister_object(existing_model)
diff --git a/llama_stack/distribution/routing_tables/scoring_functions.py b/llama_stack/distribution/routing_tables/scoring_functions.py
new file mode 100644
index 000000000..d85f64b57
--- /dev/null
+++ b/llama_stack/distribution/routing_tables/scoring_functions.py
@@ -0,0 +1,62 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import ParamType
+from llama_stack.apis.resource import ResourceType
+from llama_stack.apis.scoring_functions import (
+    ListScoringFunctionsResponse,
+    ScoringFn,
+    ScoringFnParams,
+    ScoringFunctions,
+)
+from llama_stack.distribution.datatypes import (
+    ScoringFnWithACL,
+)
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl
+
+logger = get_logger(name=__name__, category="core")
+
+
+class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
+    """Routing table exposing the ScoringFunctions API on top of CommonRoutingTableImpl."""
+
+    async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
+        return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))
+
+    async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn:
+        """Return the scoring function named scoring_fn_id; raise ValueError if the lookup yields None."""
+        scoring_fn = await self.get_object_by_identifier(ResourceType.scoring_function.value, scoring_fn_id)
+        if scoring_fn is None:
+            raise ValueError(f"Scoring function '{scoring_fn_id}' not found")
+        return scoring_fn
+
+    async def register_scoring_function(
+        self,
+        scoring_fn_id: str,
+        description: str,
+        return_type: ParamType,
+        provider_scoring_fn_id: str | None = None,
+        provider_id: str | None = None,
+        params: ScoringFnParams | None = None,
+    ) -> None:
+        """Register a scoring function; provider_id may be omitted only when exactly one provider exists."""
+        if provider_id is None:
+            if len(self.impls_by_provider_id) != 1:
+                raise ValueError(
+                    "No provider specified and multiple providers available. Please specify a provider_id."
+                )
+            provider_id = next(iter(self.impls_by_provider_id))
+        scoring_fn = ScoringFnWithACL(
+            identifier=scoring_fn_id,
+            description=description,
+            return_type=return_type,
+            provider_resource_id=scoring_fn_id if provider_scoring_fn_id is None else provider_scoring_fn_id,
+            provider_id=provider_id,  # set once here; no reassignment after construction is needed
+            params=params,
+        )
+        await self.register_object(scoring_fn)
diff --git a/llama_stack/distribution/routing_tables/shields.py b/llama_stack/distribution/routing_tables/shields.py
new file mode 100644
index 000000000..7f62596c9
--- /dev/null
+++ b/llama_stack/distribution/routing_tables/shields.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.resource import ResourceType
+from llama_stack.apis.shields import ListShieldsResponse, Shield, Shields
+from llama_stack.distribution.datatypes import (
+    ShieldWithACL,
+)
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl
+
+logger = get_logger(name=__name__, category="core")
+
+
+class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
+    """Routing table exposing the Shields API on top of CommonRoutingTableImpl."""
+
+    async def list_shields(self) -> ListShieldsResponse:
+        shields = await self.get_all_with_type(ResourceType.shield.value)
+        return ListShieldsResponse(data=shields)
+
+    async def get_shield(self, identifier: str) -> Shield:
+        """Return the shield registered as identifier; raise ValueError if the lookup yields None."""
+        found = await self.get_object_by_identifier("shield", identifier)
+        if found is not None:
+            return found
+        raise ValueError(f"Shield '{identifier}' not found")
+
+    async def register_shield(
+        self,
+        shield_id: str,
+        provider_shield_id: str | None = None,
+        provider_id: str | None = None,
+        params: dict[str, Any] | None = None,
+    ) -> Shield:
+        """Register a shield and return it; provider_shield_id defaults to shield_id, params to {}."""
+        if provider_id is None:
+            # Only a single-provider setup lets the caller omit provider_id.
+            if len(self.impls_by_provider_id) != 1:
+                raise ValueError(
+                    "No provider specified and multiple providers available. Please specify a provider_id."
+                )
+            provider_id = list(self.impls_by_provider_id.keys())[0]
+        new_shield = ShieldWithACL(
+            identifier=shield_id,
+            provider_resource_id=shield_id if provider_shield_id is None else provider_shield_id,
+            provider_id=provider_id,
+            params={} if params is None else params,
+        )
+        await self.register_object(new_shield)
+        return new_shield
diff --git a/llama_stack/distribution/routing_tables/toolgroups.py b/llama_stack/distribution/routing_tables/toolgroups.py
new file mode 100644
index 000000000..2f7dc3e06
--- /dev/null
+++ b/llama_stack/distribution/routing_tables/toolgroups.py
@@ -0,0 +1,132 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.common.content_types import URL
+from llama_stack.apis.tools import ListToolGroupsResponse, ListToolsResponse, Tool, ToolGroup, ToolGroups
+from llama_stack.distribution.datatypes import ToolGroupWithACL
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl
+
+logger = get_logger(name=__name__, category="core")
+
+
+def parse_toolgroup_from_toolgroup_name_pair(toolgroup_name_with_maybe_tool_name: str) -> str | None:
+    """Return the part before "/" when the name holds exactly one "/", else None.
+
+    Example: "builtin::rag/knowledge_search" -> "builtin::rag".
+    """
+    head, *rest = toolgroup_name_with_maybe_tool_name.split("/")
+    return head if len(rest) == 1 else None
+
+
+class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
+    # Tool caches: filled by _index_tools, evicted per group by unregister_toolgroup.
+    # NOTE(review): these are class attributes, so all instances share them — confirm that is intended.
+    toolgroups_to_tools: dict[str, list[Tool]] = {}
+    tool_to_toolgroup: dict[str, str] = {}
+
+    # overridden
+    def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
+        # we don't index tools in the registry anymore, but only keep a cache of them by toolgroup_id
+        # TODO: we may want to invalidate the cache (for a given toolgroup_id) every once in a while?
+        toolgroup_id = parse_toolgroup_from_toolgroup_name_pair(routing_key)
+        if toolgroup_id:
+            routing_key = toolgroup_id
+        if routing_key in self.tool_to_toolgroup:
+            routing_key = self.tool_to_toolgroup[routing_key]
+        return super().get_provider_impl(routing_key, provider_id)
+
+    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolsResponse:
+        """List tools of one toolgroup (a "group/tool" id is reduced to its group), or of all groups."""
+        if toolgroup_id:
+            if group_id := parse_toolgroup_from_toolgroup_name_pair(toolgroup_id):
+                toolgroup_id = group_id
+            toolgroups = [await self.get_tool_group(toolgroup_id)]
+        else:
+            toolgroups = await self.get_all_with_type("tool_group")
+        all_tools = []
+        for toolgroup in toolgroups:
+            if toolgroup.identifier not in self.toolgroups_to_tools:
+                await self._index_tools(toolgroup)
+            all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier])
+        return ListToolsResponse(data=all_tools)
+
+    async def _index_tools(self, toolgroup: ToolGroup):
+        """Fetch the group's runtime tools from its provider and (re)populate both caches."""
+        provider_impl = super().get_provider_impl(toolgroup.identifier, toolgroup.provider_id)
+        tooldefs_response = await provider_impl.list_runtime_tools(toolgroup.identifier, toolgroup.mcp_endpoint)
+        # TODO: kill this Tool vs ToolDef distinction
+        tools = [
+            Tool(
+                identifier=t.name,
+                toolgroup_id=toolgroup.identifier,
+                description=t.description or "",
+                parameters=t.parameters or [],
+                metadata=t.metadata,
+                provider_id=toolgroup.provider_id,
+            )
+            for t in tooldefs_response.data
+        ]
+
+        self.toolgroups_to_tools[toolgroup.identifier] = tools
+        for tool in tools:
+            self.tool_to_toolgroup[tool.identifier] = toolgroup.identifier
+
+    async def list_tool_groups(self) -> ListToolGroupsResponse:
+        return ListToolGroupsResponse(data=await self.get_all_with_type("tool_group"))
+
+    async def get_tool_group(self, toolgroup_id: str) -> ToolGroup:
+        tool_group = await self.get_object_by_identifier("tool_group", toolgroup_id)
+        if tool_group is None:
+            raise ValueError(f"Tool group '{toolgroup_id}' not found")
+        return tool_group
+
+    async def get_tool(self, tool_name: str) -> Tool:
+        # only consults the cache: a tool is found only after its group has been indexed
+        if tool_name in self.tool_to_toolgroup:
+            toolgroup_id = self.tool_to_toolgroup[tool_name]
+            tools = self.toolgroups_to_tools[toolgroup_id]
+            for tool in tools:
+                if tool.identifier == tool_name:
+                    return tool
+        raise ValueError(f"Tool '{tool_name}' not found")
+
+    async def register_tool_group(
+        self,
+        toolgroup_id: str,
+        provider_id: str,
+        mcp_endpoint: URL | None = None,
+        args: dict[str, Any] | None = None,
+    ) -> None:
+        toolgroup = ToolGroupWithACL(
+            identifier=toolgroup_id,
+            provider_id=provider_id,
+            provider_resource_id=toolgroup_id,
+            mcp_endpoint=mcp_endpoint,
+            args=args,
+        )
+        await self.register_object(toolgroup)
+
+        # ideally, indexing of the tools should not be necessary because anyone using
+        # the tools should first list the tools and then use them. but there are assumptions
+        # baked in some of the code and tests right now.
+        if not toolgroup.mcp_endpoint:
+            await self._index_tools(toolgroup)
+        return toolgroup
+
+    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
+        tool_group = await self.get_tool_group(toolgroup_id)  # raises ValueError when the group is unknown
+        await self.unregister_object(tool_group)
+        # drop the group's cached tools so get_tool / get_provider_impl stop resolving them
+        for tool in self.toolgroups_to_tools.pop(tool_group.identifier, []):
+            if self.tool_to_toolgroup.get(tool.identifier) == tool_group.identifier:
+                del self.tool_to_toolgroup[tool.identifier]
+
+    async def shutdown(self) -> None:
+        pass
diff --git a/llama_stack/distribution/routing_tables/vector_dbs.py b/llama_stack/distribution/routing_tables/vector_dbs.py
new file mode 100644
index 000000000..dc6c0d0ef
--- /dev/null
+++ b/llama_stack/distribution/routing_tables/vector_dbs.py
@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pydantic import TypeAdapter
+
+from llama_stack.apis.models import ModelType
+from llama_stack.apis.resource import ResourceType
+from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs
+from llama_stack.distribution.datatypes import (
+    VectorDBWithACL,
+)
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl
+
+logger = get_logger(name=__name__, category="core")
+
+
+class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
+    async def list_vector_dbs(self) -> ListVectorDBsResponse:
+        return ListVectorDBsResponse(data=await self.get_all_with_type("vector_db"))
+
+    async def get_vector_db(self, vector_db_id: str) -> VectorDB:
+        found = await self.get_object_by_identifier("vector_db", vector_db_id)
+        if found is not None:
+            return found
+        raise ValueError(f"Vector DB '{vector_db_id}' not found")
+
+    async def register_vector_db(
+        self,
+        vector_db_id: str,
+        embedding_model: str,
+        embedding_dimension: int | None = 384,
+        provider_id: str | None = None,
+        provider_vector_db_id: str | None = None,
+    ) -> VectorDB:
+        """Register a vector DB; its dimension is read from the embedding model's metadata and the
+        embedding_dimension argument is not used. Raises ValueError on a missing provider or unusable model.
+        """
+        if provider_id is None:
+            provider_ids = list(self.impls_by_provider_id.keys())
+            if not provider_ids:
+                raise ValueError("No provider available. Please configure a vector_io provider.")
+            provider_id = provider_ids[0]
+            if len(provider_ids) > 1:
+                logger.warning(
+                    f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
+                )
+        model = await self.get_object_by_identifier("model", embedding_model)
+        if model is None:
+            raise ValueError(f"Model {embedding_model} not found")
+        if model.model_type != ModelType.embedding:
+            raise ValueError(f"Model {embedding_model} is not an embedding model")
+        if "embedding_dimension" not in model.metadata:
+            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
+        payload = {
+            "identifier": vector_db_id,
+            "type": ResourceType.vector_db.value,
+            "provider_id": provider_id,
+            "provider_resource_id": vector_db_id if provider_vector_db_id is None else provider_vector_db_id,
+            "embedding_model": embedding_model,
+            "embedding_dimension": model.metadata["embedding_dimension"],
+        }
+        vector_db = TypeAdapter(VectorDBWithACL).validate_python(payload)
+        await self.register_object(vector_db)
+        return vector_db
+
+    async def unregister_vector_db(self, vector_db_id: str) -> None:
+        if (target := await self.get_vector_db(vector_db_id)) is None:
+            raise ValueError(f"Vector DB {vector_db_id} not found")
+        await self.unregister_object(target)
diff --git a/llama_stack/distribution/server/auth.py b/llama_stack/distribution/server/auth.py
index 52e6a013c..fb26b49a7 100644
--- a/llama_stack/distribution/server/auth.py
+++ b/llama_stack/distribution/server/auth.py
@@ -5,74 +5,30 @@
 # the root directory of this source tree.
 
 import json
-from typing import Dict, List, Optional
-from urllib.parse import parse_qs
 
 import httpx
-from pydantic import BaseModel, Field
 
-from llama_stack.distribution.datatypes import AccessAttributes
+from llama_stack.distribution.datatypes import AuthenticationConfig
+from llama_stack.distribution.server.auth_providers import create_auth_provider
 from llama_stack.log import get_logger
 
 logger = get_logger(name=__name__, category="auth")
 
 
-class AuthRequestContext(BaseModel):
-    path: str = Field(description="The path of the request being authenticated")
-
-    headers: Dict[str, str] = Field(description="HTTP headers from the original request (excluding Authorization)")
-
-    params: Dict[str, List[str]] = Field(
-        description="Query parameters from the original request, parsed as dictionary of lists"
-    )
-
-
-class AuthRequest(BaseModel):
-    api_key: str = Field(description="The API key extracted from the Authorization header")
-
-    request: AuthRequestContext = Field(description="Context information about the request being authenticated")
-
-
-class AuthResponse(BaseModel):
-    """The format of the authentication response from the auth endpoint."""
-
-    access_attributes: Optional[AccessAttributes] = Field(
-        default=None,
-        description="""
-        Structured user attributes for attribute-based access control.
-
-        These attributes determine which resources the user can access.
-        The model provides standard categories like "roles", "teams", "projects", and "namespaces".
-        Each attribute category contains a list of values that the user has for that category.
-        During access control checks, these values are compared against resource requirements.
-
-        Example with standard categories:
-        ```json
-        {
-            "roles": ["admin", "data-scientist"],
-            "teams": ["ml-team"],
-            "projects": ["llama-3"],
-            "namespaces": ["research"]
-        }
-        ```
-        """,
-    )
-
-    message: Optional[str] = Field(
-        default=None, description="Optional message providing additional context about the authentication result."
-    )
-
-
 class AuthenticationMiddleware:
-    """Middleware that authenticates requests using an external auth endpoint.
+    """Middleware that authenticates requests using configured authentication provider.
 
     This middleware:
     1. Extracts the Bearer token from the Authorization header
-    2. Sends it to the configured auth endpoint along with request details
-    3. Validates the response and extracts user attributes
+    2. Uses the configured auth provider to validate the token
+    3. Extracts user attributes from the provider's response
     4. Makes these attributes available to the route handlers for access control
 
-    Authentication Request Format:
+    The middleware supports multiple authentication providers through the AuthProvider interface:
+    - Kubernetes: Validates tokens against the Kubernetes API server
+    - Custom: Validates tokens against a custom endpoint
+
+    Authentication Request Format for Custom Auth Provider:
     ```json
     {
         "api_key": "the-api-key-extracted-from-auth-header",
@@ -105,21 +61,26 @@ class AuthenticationMiddleware:
     }
     ```
 
+    Token Validation:
+    Each provider implements its own token validation logic:
+    - Kubernetes: Uses TokenReview API to validate service account tokens
+    - Custom: Sends token to custom endpoint for validation
+
     Attribute-Based Access Control:
-    The attributes returned by the auth endpoint are used to determine which
+    The attributes returned by the auth provider are used to determine which
     resources the user can access. Resources can specify required attributes
     using the access_attributes field. For a user to access a resource:
 
     1. All attribute categories specified in the resource must be present in the user's attributes
     2. For each category, the user must have at least one matching value
 
-    If the auth endpoint doesn't return any attributes, the user will only be able to
+    If the auth provider doesn't return any attributes, the user will only be able to
     access resources that don't have access_attributes defined.
     """
 
-    def __init__(self, app, auth_endpoint):
+    def __init__(self, app, auth_config: AuthenticationConfig):
         self.app = app
-        self.auth_endpoint = auth_endpoint
+        self.auth_provider = create_auth_provider(auth_config)
 
     async def __call__(self, scope, receive, send):
         if scope["type"] == "http":
@@ -129,66 +90,41 @@ class AuthenticationMiddleware:
             if not auth_header or not auth_header.startswith("Bearer "):
                 return await self._send_auth_error(send, "Missing or invalid Authorization header")
 
-            api_key = auth_header.split("Bearer ", 1)[1]
+            token = auth_header.split("Bearer ", 1)[1]
 
-            path = scope.get("path", "")
-            request_headers = {k.decode(): v.decode() for k, v in headers.items()}
-
-            # Remove sensitive headers
-            if "authorization" in request_headers:
-                del request_headers["authorization"]
-
-            query_string = scope.get("query_string", b"").decode()
-            params = parse_qs(query_string)
-
-            # Build the auth request model
-            auth_request = AuthRequest(
-                api_key=api_key,
-                request=AuthRequestContext(
-                    path=path,
-                    headers=request_headers,
-                    params=params,
-                ),
-            )
-
-            # Validate with authentication endpoint
+            # Validate token and get access attributes
             try:
-                async with httpx.AsyncClient() as client:
-                    response = await client.post(
-                        self.auth_endpoint,
-                        json=auth_request.model_dump(),
-                        timeout=10.0,  # Add a reasonable timeout
-                    )
-                    if response.status_code != 200:
-                        logger.warning(f"Authentication failed: {response.status_code}")
-                        return await self._send_auth_error(send, "Authentication failed")
-
-                    # Parse and validate the auth response
-                    try:
-                        response_data = response.json()
-                        auth_response = AuthResponse(**response_data)
-
-                        # Store attributes in request scope for access control
-                        if auth_response.access_attributes:
-                            user_attributes = auth_response.access_attributes.model_dump(exclude_none=True)
-                        else:
-                            logger.warning("No access attributes, setting namespace to api_key by default")
-                            user_attributes = {
-                                "namespaces": [api_key],
-                            }
-
-                        scope["user_attributes"] = user_attributes
-                        logger.debug(f"Authentication successful: {len(user_attributes)} attributes")
-                    except Exception:
-                        logger.exception("Error parsing authentication response")
-                        return await self._send_auth_error(send, "Invalid authentication response format")
+                validation_result = await self.auth_provider.validate_token(token, scope)
             except httpx.TimeoutException:
                 logger.exception("Authentication request timed out")
                 return await self._send_auth_error(send, "Authentication service timeout")
+            except ValueError as e:
+                logger.exception("Error during authentication")
+                return await self._send_auth_error(send, str(e))
             except Exception:
                 logger.exception("Error during authentication")
                 return await self._send_auth_error(send, "Authentication service error")
 
+            # Store attributes in request scope for access control
+            if validation_result.access_attributes:
+                user_attributes = validation_result.access_attributes.model_dump(exclude_none=True)
+            else:
+                logger.warning("No access attributes, setting namespace to token by default")
+                user_attributes = {
+                    "roles": [token],
+                }
+
+            # Store the client ID in the request scope so that downstream middleware (like QuotaMiddleware)
+            # can identify the requester and enforce per-client rate limits.
+            scope["authenticated_client_id"] = token
+
+            # Store attributes in request scope
+            scope["user_attributes"] = user_attributes
+            scope["principal"] = validation_result.principal
+            logger.debug(
+                f"Authentication successful: {validation_result.principal} with {len(scope['user_attributes'])} attributes"
+            )
+
         return await self.app(scope, receive, send)
 
     async def _send_auth_error(self, send, message):
diff --git a/llama_stack/distribution/server/auth_providers.py b/llama_stack/distribution/server/auth_providers.py
new file mode 100644
index 000000000..723a65b77
--- /dev/null
+++ b/llama_stack/distribution/server/auth_providers.py
@@ -0,0 +1,376 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import ssl
+import time
+from abc import ABC, abstractmethod
+from asyncio import Lock
+from pathlib import Path
+from urllib.parse import parse_qs
+
+import httpx
+from jose import jwt
+from pydantic import BaseModel, Field, field_validator, model_validator
+from typing_extensions import Self
+
+from llama_stack.distribution.datatypes import AccessAttributes, AuthenticationConfig, AuthProviderType
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="auth")
+
+
+class TokenValidationResult(BaseModel):  # value returned by AuthProvider.validate_token
+    principal: str | None = Field(
+        default=None,
+        description="The principal (username or persistent identifier) of the authenticated user",
+    )
+    access_attributes: AccessAttributes | None = Field(
+        default=None,
+        description="""
+        Structured user attributes for attribute-based access control.
+
+        These attributes determine which resources the user can access.
+        The model provides standard categories like "roles", "teams", "projects", and "namespaces".
+        Each attribute category contains a list of values that the user has for that category.
+        During access control checks, these values are compared against resource requirements.
+
+        Example with standard categories:
+        ```json
+        {
+            "roles": ["admin", "data-scientist"],
+            "teams": ["ml-team"],
+            "projects": ["llama-3"],
+            "namespaces": ["research"]
+        }
+        ```
+        """,
+    )
+
+
+class AuthResponse(TokenValidationResult):
+    """Response format of the auth endpoint: the TokenValidationResult fields plus an optional message."""
+
+    message: str | None = Field(
+        default=None, description="Optional message providing additional context about the authentication result."
+    )
+
+
+class AuthRequestContext(BaseModel):  # request details carried in AuthRequest.request
+    path: str = Field(description="The path of the request being authenticated")
+
+    headers: dict[str, str] = Field(description="HTTP headers from the original request (excluding Authorization)")
+
+    params: dict[str, list[str]] = Field(
+        description="Query parameters from the original request, parsed as dictionary of lists"
+    )
+
+
+class AuthRequest(BaseModel):  # pairs the API key with the context of the request being authenticated
+    api_key: str = Field(description="The API key extracted from the Authorization header")
+
+    request: AuthRequestContext = Field(description="Context information about the request being authenticated")
+
+
+class AuthProvider(ABC):
+    """Interface every authentication provider implements: validate a token, release resources."""
+
+    @abstractmethod
+    async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
+        """Validate token and return the resulting TokenValidationResult."""
+        ...
+
+    @abstractmethod
+    async def close(self):
+        """Clean up any resources held by the provider."""
+        ...
+
+
+def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> AccessAttributes:
+    """Map claims onto an AccessAttributes object; mapping is claim name -> attribute name.
+    Entries with an absent claim or unknown attribute are skipped. List claims are copied, other
+    claims are split on whitespace; several claims may feed the same attribute.
+    """
+    attributes = AccessAttributes()
+    for claim_key, attribute_key in mapping.items():
+        if claim_key not in claims or not hasattr(attributes, attribute_key):
+            continue
+        claim = claims[claim_key]
+        # Copy list claims: the stored list is extended later and must not alias the caller's data.
+        values = list(claim) if isinstance(claim, list) else claim.split()
+        if current := getattr(attributes, attribute_key):
+            current.extend(values)
+        else:
+            setattr(attributes, attribute_key, values)
+    return attributes
+
+
+class OAuth2JWKSConfig(BaseModel):  # JWKS settings; key_recheck_period is presumably in seconds — TODO confirm
+    # The JWKS URI for collecting public keys
+    uri: str
+    key_recheck_period: int = Field(default=3600, description="The period to recheck the JWKS URI for key updates")
+
+
+class OAuth2IntrospectionConfig(BaseModel):  # settings for the token introspection request
+    url: str  # endpoint the token is POSTed to
+    client_id: str
+    client_secret: str
+    send_secret_in_body: bool = False  # True: credentials go in the form body; False: passed as the request's auth pair
+
+
+class OAuth2TokenAuthProviderConfig(BaseModel):  # set exactly one of jwks / introspection (see validate_mode)
+    audience: str = "llama-stack"
+    verify_tls: bool = True
+    tls_cafile: Path | None = None
+    issuer: str | None = Field(default=None, description="The OIDC issuer URL.")
+    claims_mapping: dict[str, str] = Field(  # token claim name -> AccessAttributes field name
+        default_factory=lambda: {
+            "sub": "roles",
+            "username": "roles",
+            "groups": "teams",
+            "team": "teams",
+            "project": "projects",
+            "tenant": "namespaces",
+            "namespace": "namespaces",
+        },
+    )
+    jwks: OAuth2JWKSConfig | None = None  # optional, so introspection-only configs need not spell out jwks
+    introspection: OAuth2IntrospectionConfig | None = None
+
+    @field_validator("claims_mapping")  # must be outermost, otherwise pydantic never registers the validator
+    @classmethod
+    def validate_claims_mapping(cls, v):
+        for key, value in v.items():
+            if not value:
+                raise ValueError(f"claims_mapping value cannot be empty: {key}")
+            if value not in AccessAttributes.model_fields:
+                raise ValueError(f"claims_mapping value is not a valid attribute: {value}")
+        return v
+
+    @model_validator(mode="after")
+    def validate_mode(self) -> Self:
+        if not self.jwks and not self.introspection:
+            raise ValueError("One of jwks or introspection must be configured")
+        if self.jwks and self.introspection:
+            raise ValueError("At present only one of jwks or introspection should be configured")
+        return self
+
+
+class OAuth2TokenAuthProvider(AuthProvider):
+    """
+    JWT token authentication provider that validates a JWT token and extracts access attributes.
+
+    This should be the standard authentication provider for most use cases.
+    """
+
+    def __init__(self, config: OAuth2TokenAuthProviderConfig):
+        self.config = config
+        self._jwks_at: float = 0.0  # presumably the time of the last JWKS fetch — TODO confirm in _refresh_jwks
+        self._jwks: dict[str, str] = {}  # looked up by the token header's "kid" in validate_jwt_token
+        self._jwks_lock = Lock()  # asyncio.Lock; presumably guards the JWKS refresh — TODO confirm
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
+        # jwks takes precedence over introspection; with neither configured the call fails
+        if not (self.config.jwks or self.config.introspection):
+            raise ValueError("One of jwks or introspection must be configured")
+        validator = self.validate_jwt_token if self.config.jwks else self.introspect_token
+        return await validator(token, scope)
+
+    async def validate_jwt_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
+        """Verify the JWT against the cached JWKS; raise ValueError for any invalid or unverifiable token."""
+        await self._refresh_jwks()
+
+        try:
+            header = jwt.get_unverified_header(token)
+            kid = header["kid"]
+            if kid not in self._jwks:
+                raise ValueError(f"Unknown key ID: {kid}")
+            key_data = self._jwks[kid]
+            # NOTE(review): the algorithm comes from the unverified header — consider pinning an allow-list.
+            algorithm = header.get("alg", "RS256")
+            claims = jwt.decode(
+                token,
+                key_data,
+                algorithms=[algorithm],
+                audience=self.config.audience,
+                issuer=self.config.issuer,
+            )
+        except Exception as exc:
+            # Never echo the bearer token: callers may log this message or return it to the client.
+            raise ValueError("Invalid JWT token") from exc
+
+        # There are other standard claims, the most relevant of which is `scope`.
+        # We should incorporate these into the access attributes.
+        return TokenValidationResult(
+            principal=claims["sub"],
+            access_attributes=get_attributes_from_claims(claims, self.config.claims_mapping),
+        )
+
+    async def introspect_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
+        """Validate a token using token introspection as defined by RFC 7662.
+
+        Raises ValueError for inactive tokens or failed requests; timeouts propagate as httpx.TimeoutException.
+        """
+        form = {"token": token}
+        if self.config.introspection is None:
+            raise ValueError("Introspection is not configured")
+
+        if self.config.introspection.send_secret_in_body:
+            form["client_id"] = self.config.introspection.client_id
+            form["client_secret"] = self.config.introspection.client_secret
+            auth = None
+        else:
+            auth = (self.config.introspection.client_id, self.config.introspection.client_secret)
+        # A CA file wins; otherwise honor verify_tls. Never pass verify=None to httpx.
+        verify: ssl.SSLContext | bool = self.config.verify_tls
+        if self.config.tls_cafile:
+            verify = ssl.create_default_context(cafile=self.config.tls_cafile.as_posix())
+        try:
+            async with httpx.AsyncClient(verify=verify) as client:
+                response = await client.post(
+                    self.config.introspection.url,
+                    data=form,
+                    auth=auth,
+                    timeout=10.0,  # Add a reasonable timeout
+                )
+                if response.status_code != 200:
+                    logger.warning(f"Token introspection failed with status code: {response.status_code}")
+                    raise ValueError(f"Token introspection failed: {response.status_code}")
+
+                fields = response.json()
+                if not fields["active"]:
+                    raise ValueError("Token not active")
+                return TokenValidationResult(
+                    principal=fields.get("sub") or fields["username"],  # "sub" is optional in RFC 7662
+                    access_attributes=get_attributes_from_claims(fields, self.config.claims_mapping),
+                )
+        except httpx.TimeoutException:
+            logger.exception("Token introspection request timed out")
+            raise
+        except ValueError:
+            # Re-raise ValueError exceptions to preserve their message
+            raise
+        except Exception as e:
+            logger.exception("Error during token introspection")
+            raise ValueError("Token introspection error") from e
+
+    async def close(self):
+        """Intentionally a no-op."""
+
+    async def _refresh_jwks(self) -> None:
+        """
+        Refresh the JWKS cache.
+
+        This is a simple cache that expires after a certain amount of time (defined by `key_recheck_period`).
+        If the cache is expired, we refresh the JWKS from the JWKS URI.
+
+        Notes: for Kubernetes which doesn't fully implement the OIDC protocol:
+            * It doesn't have user authentication flows
+            * It doesn't have refresh tokens
+        """
+        # Hold the lock while checking and fetching so concurrent callers do not refresh in parallel.
+        async with self._jwks_lock:
+            if self.config.jwks is None:
+                raise ValueError("JWKS is not configured")
+            if time.time() - self._jwks_at > self.config.jwks.key_recheck_period:
+                verify = self.config.tls_cafile.as_posix() if self.config.tls_cafile else self.config.verify_tls
+                async with httpx.AsyncClient(verify=verify) as client:
+                    res = await client.get(self.config.jwks.uri, timeout=5)
+                    res.raise_for_status()
+                    jwks_data = res.json()["keys"]
+                    # Index the full key objects by `kid`. `kid` is an optional JWK member (RFC 7517),
+                    # so skip keys without one instead of failing the whole refresh with a KeyError;
+                    # validate_jwt_token looks keys up by `kid` only, so they are unusable anyway.
+                    updated = {k["kid"]: k for k in jwks_data if "kid" in k}
+                    self._jwks = updated
+                    self._jwks_at = time.time()
+
+
+class CustomAuthProviderConfig(BaseModel):  # settings consumed by CustomAuthProvider
+    endpoint: str  # URL that CustomAuthProvider.validate_token POSTs the auth request to
+
+
+class CustomAuthProvider(AuthProvider):
+    """Auth provider that delegates token validation to an external HTTP endpoint."""
+
+    def __init__(self, config: CustomAuthProviderConfig):
+        self.config = config
+        self._client = None
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
+        """Send the token and request context to the configured endpoint and return its verdict."""
+        asgi_scope = {} if scope is None else scope
+
+        raw_headers = dict(asgi_scope.get("headers", []))
+        path = asgi_scope.get("path", "")
+        # Forward every header except the credential itself
+        request_headers = {
+            name.decode(): value.decode()
+            for name, value in raw_headers.items()
+        }
+        request_headers.pop("authorization", None)
+
+        query_string = asgi_scope.get("query_string", b"").decode()
+        params = parse_qs(query_string)
+
+        # Payload describing the caller's request for the external service
+        auth_request = AuthRequest(
+            api_key=token,
+            request=AuthRequestContext(
+                path=path,
+                headers=request_headers,
+                params=params,
+            ),
+        )
+
+        # Ask the authentication endpoint for a decision
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.post(
+                    self.config.endpoint,
+                    json=auth_request.model_dump(),
+                    timeout=10.0,  # Add a reasonable timeout
+                )
+                if response.status_code != 200:
+                    logger.warning(f"Authentication failed with status code: {response.status_code}")
+                    raise ValueError(f"Authentication failed: {response.status_code}")
+
+                # Any failure to decode or model the reply is reported as a format error
+                try:
+                    return AuthResponse(**response.json())
+                except Exception as e:
+                    logger.exception("Error parsing authentication response")
+                    raise ValueError("Invalid authentication response format") from e
+
+        except httpx.TimeoutException:
+            logger.exception("Authentication request timed out")
+            raise
+        except ValueError:
+            # Already carries a meaningful message; let it through untouched
+            raise
+        except Exception as e:
+            logger.exception("Error during authentication")
+            raise ValueError("Authentication service error") from e
+
+    async def close(self):
+        """Close the HTTP client."""
+        if not self._client:
+            # No client is stored, so there is nothing to close
+            return
+        await self._client.aclose()
+        self._client = None
+
+
+def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
+    """Build the AuthProvider implementation selected by ``config.provider_type``."""
+    provider_type = config.provider_type.lower()
+
+    if provider_type == "oauth2_token":
+        return OAuth2TokenAuthProvider(OAuth2TokenAuthProviderConfig.model_validate(config.config))
+    if provider_type == "custom":
+        return CustomAuthProvider(CustomAuthProviderConfig.model_validate(config.config))
+
+    supported_providers = ", ".join(t.value for t in AuthProviderType)
+    raise ValueError(f"Unsupported auth provider type: {provider_type}. Supported types are: {supported_providers}")
diff --git a/llama_stack/distribution/server/quota.py b/llama_stack/distribution/server/quota.py
new file mode 100644
index 000000000..ddbffae64
--- /dev/null
+++ b/llama_stack/distribution/server/quota.py
@@ -0,0 +1,110 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+import time
+from datetime import datetime, timedelta, timezone
+
+from starlette.types import ASGIApp, Receive, Scope, Send
+
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.kvstore.api import KVStore
+from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
+from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl
+
+logger = get_logger(name=__name__, category="quota")
+
+
+class QuotaMiddleware:
+    """
+    ASGI middleware that enforces separate quotas for authenticated and anonymous clients
+    within a configurable time window.
+
+    - For authenticated requests, the client is identified by the `authenticated_client_id`
+      entry of the ASGI scope, which this class only reads and never sets itself.
+    - For anonymous requests, it falls back to the IP address of the client.
+    Requests are counted in a KV store (e.g., SQLite), and HTTP 429 is returned
+    once a client exceeds its quota.
+
+    Counting uses fixed windows of `window_seconds`; it is not a sliding window.
+    """
+
+    def __init__(
+        self,
+        app: ASGIApp,
+        kv_config: KVStoreConfig,
+        anonymous_max_requests: int,
+        authenticated_max_requests: int,
+        window_seconds: int = 86400,
+    ):
+        """Store the limits; the KV store itself is created lazily on the first request."""
+        self.app = app
+        self.kv_config = kv_config
+        self.kv: KVStore | None = None
+        self.anonymous_max_requests = anonymous_max_requests
+        self.authenticated_max_requests = authenticated_max_requests
+        self.window_seconds = window_seconds
+
+        if isinstance(self.kv_config, SqliteKVStoreConfig):
+            logger.warning(
+                "QuotaMiddleware: Using SQLite backend. Expiry/TTL is not enforced; cleanup is manual. "
+                f"window_seconds={self.window_seconds}"
+            )
+
+    async def _get_kv(self) -> KVStore:
+        """Return the KV store, creating it from `kv_config` on first use."""
+        if self.kv is None:
+            self.kv = await kvstore_impl(self.kv_config)
+        return self.kv
+
+    async def __call__(self, scope: Scope, receive: Receive, send: Send):
+        """Count an HTTP request against the caller's quota, then forward or reject it."""
+        if scope["type"] == "http":
+            # pick key & limit based on auth
+            auth_id = scope.get("authenticated_client_id")
+            if auth_id:
+                key_id = auth_id
+                limit = self.authenticated_max_requests
+            else:
+                # fallback to IP
+                client = scope.get("client")
+                key_id = client[0] if client else "anonymous"
+                limit = self.anonymous_max_requests
+
+            # The window index is part of the key, so every window starts counting from zero.
+            current_window = int(time.time() // self.window_seconds)
+            key = f"quota:{key_id}:{current_window}"
+
+            try:
+                kv = await self._get_kv()
+                # NOTE(review): get-then-set is not atomic; concurrent requests can under-count.
+                count = int(await kv.get(key) or "0") + 1
+                # Send the expiration with every write, not only the first one of the window:
+                # a later set() without it may clear the TTL on stores that replace the whole
+                # entry, leaving counters that never expire.
+                expiration = datetime.now(timezone.utc) + timedelta(seconds=self.window_seconds)
+                await kv.set(key, str(count), expiration=expiration)
+            except Exception:
+                logger.exception("Failed to access KV store for quota")
+                return await self._send_error(send, 500, "Quota service error")
+
+            if count > limit:
+                logger.warning("Quota exceeded for client %s: %d/%d", key_id, count, limit)
+                return await self._send_error(send, 429, "Quota exceeded")
+
+        return await self.app(scope, receive, send)
+
+    async def _send_error(self, send: Send, status: int, message: str):
+        """Send a JSON error response of the form {"error": {"message": ...}} with `status`."""
+        await send(
+            {
+                "type": "http.response.start",
+                "status": status,
+                "headers": [[b"content-type", b"application/json"]],
+            }
+        )
+        body = json.dumps({"error": {"message": message}}).encode()
+        await send({"type": "http.response.body", "body": body})
diff --git a/llama_stack/distribution/server/endpoints.py b/llama_stack/distribution/server/routes.py
similarity index 55%
rename from llama_stack/distribution/server/endpoints.py
rename to llama_stack/distribution/server/routes.py
index 98f01c067..ea66fec5a 100644
--- a/llama_stack/distribution/server/endpoints.py
+++ b/llama_stack/distribution/server/routes.py
@@ -6,21 +6,23 @@
 
 import inspect
 import re
-from typing import Dict, List
+from collections.abc import Callable
+from typing import Any
 
-from pydantic import BaseModel
+from aiohttp import hdrs
+from starlette.routing import Route
 
 from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
 from llama_stack.apis.version import LLAMA_STACK_API_VERSION
 from llama_stack.distribution.resolver import api_protocol_map
 from llama_stack.providers.datatypes import Api
 
-
-class ApiEndpoint(BaseModel):
-    route: str
-    method: str
-    name: str
-    descriptive_name: str | None = None
+EndpointFunc = Callable[..., Any]  # implementation method looked up on an API impl
+PathParams = dict[str, str]  # presumably path parameter name -> captured value; producer not visible here
+RouteInfo = tuple[EndpointFunc, str]  # (implementation, route path)
+PathImpl = dict[str, RouteInfo]  # anchored path regex -> RouteInfo
+RouteImpls = dict[str, PathImpl]  # lower-cased HTTP method -> PathImpl
+RouteMatch = tuple[EndpointFunc, PathParams, str]  # return type of find_matching_route
 
 
 def toolgroup_protocol_map():
@@ -29,13 +31,13 @@ def toolgroup_protocol_map():
     }
 
 
-def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]:
+def get_all_api_routes() -> dict[Api, list[Route]]:
     apis = {}
 
     protocols = api_protocol_map()
     toolgroup_protocols = toolgroup_protocol_map()
     for api, protocol in protocols.items():
-        endpoints = []
+        routes = []
         protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
 
         # HACK ALERT
@@ -52,26 +54,28 @@ def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]:
             if not hasattr(method, "__webmethod__"):
                 continue
 
-            webmethod = method.__webmethod__
-            route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"
-            if webmethod.method == "GET":
-                method = "get"
-            elif webmethod.method == "DELETE":
-                method = "delete"
+            # The __webmethod__ attribute is dynamically added by the @webmethod decorator
+            # mypy doesn't know about this dynamic attribute, so we ignore the attr-defined error
+            webmethod = method.__webmethod__  # type: ignore[attr-defined]
+            path = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"
+            if webmethod.method == hdrs.METH_GET:
+                http_method = hdrs.METH_GET
+            elif webmethod.method == hdrs.METH_DELETE:
+                http_method = hdrs.METH_DELETE
             else:
-                method = "post"
-            endpoints.append(
-                ApiEndpoint(route=route, method=method, name=name, descriptive_name=webmethod.descriptive_name)
-            )
+                http_method = hdrs.METH_POST
+            routes.append(
+                Route(path=path, methods=[http_method], name=name, endpoint=None)
+            )  # setting endpoint to None since don't use a Router object
 
-        apis[api] = endpoints
+        apis[api] = routes
 
     return apis
 
 
-def initialize_endpoint_impls(impls):
-    endpoints = get_all_api_endpoints()
-    endpoint_impls = {}
+def initialize_route_impls(impls: dict[Api, Any]) -> RouteImpls:
+    routes = get_all_api_routes()
+    route_impls: RouteImpls = {}
 
     def _convert_path_to_regex(path: str) -> str:
         # Convert {param} to named capture groups
@@ -84,29 +88,34 @@ def initialize_endpoint_impls(impls):
 
         return f"^{pattern}$"
 
-    for api, api_endpoints in endpoints.items():
+    for api, api_routes in routes.items():
         if api not in impls:
             continue
-        for endpoint in api_endpoints:
+        for route in api_routes:
             impl = impls[api]
-            func = getattr(impl, endpoint.name)
-            if endpoint.method not in endpoint_impls:
-                endpoint_impls[endpoint.method] = {}
-            endpoint_impls[endpoint.method][_convert_path_to_regex(endpoint.route)] = (
+            func = getattr(impl, route.name)
+            # Get the first (and typically only) method from the set, filtering out HEAD
+            available_methods = [m for m in route.methods if m != "HEAD"]
+            if not available_methods:
+                continue  # Skip if only HEAD method is available
+            method = available_methods[0].lower()
+            if method not in route_impls:
+                route_impls[method] = {}
+            route_impls[method][_convert_path_to_regex(route.path)] = (
                 func,
-                endpoint.descriptive_name or endpoint.route,
+                route.path,
             )
 
-    return endpoint_impls
+    return route_impls
 
 
-def find_matching_endpoint(method, path, endpoint_impls):
+def find_matching_route(method: str, path: str, route_impls: RouteImpls) -> RouteMatch:
     """Find the matching endpoint implementation for a given method and path.
 
     Args:
         method: HTTP method (GET, POST, etc.)
         path: URL path to match against
-        endpoint_impls: A dictionary of endpoint implementations
+        route_impls: A dictionary of endpoint implementations
 
     Returns:
         A tuple of (endpoint_function, path_params, descriptive_name)
@@ -114,7 +123,7 @@ def find_matching_endpoint(method, path, endpoint_impls):
     Raises:
         ValueError: If no matching endpoint is found
     """
-    impls = endpoint_impls.get(method.lower())
+    impls = route_impls.get(method.lower())
     if not impls:
         raise ValueError(f"No endpoint found for {path}")
 
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
index 6e9941d1c..6c88bbfe9 100644
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@@ -6,36 +6,42 @@
 
 import argparse
 import asyncio
+import functools
 import inspect
 import json
 import os
+import ssl
 import sys
 import traceback
 import warnings
+from collections.abc import Callable
 from contextlib import asynccontextmanager
 from importlib.metadata import version as parse_version
 from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import Annotated, Any
 
+import rich.pretty
 import yaml
+from aiohttp import hdrs
 from fastapi import Body, FastAPI, HTTPException, Request
 from fastapi import Path as FastapiPath
 from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 from openai import BadRequestError
 from pydantic import BaseModel, ValidationError
-from typing_extensions import Annotated
 
-from llama_stack.distribution.datatypes import LoggingConfig, StackRunConfig
+from llama_stack.distribution.datatypes import AuthenticationRequiredError, LoggingConfig, StackRunConfig
 from llama_stack.distribution.distribution import builtin_automatically_routed_apis
 from llama_stack.distribution.request_headers import (
     PROVIDER_DATA_VAR,
     request_provider_data_context,
 )
 from llama_stack.distribution.resolver import InvalidProviderError
-from llama_stack.distribution.server.endpoints import (
-    find_matching_endpoint,
-    initialize_endpoint_impls,
+from llama_stack.distribution.server.routes import (
+    find_matching_route,
+    get_all_api_routes,
+    initialize_route_impls,
 )
 from llama_stack.distribution.stack import (
     construct_stack,
@@ -58,7 +64,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
 )
 
 from .auth import AuthenticationMiddleware
-from .endpoints import get_all_api_endpoints
+from .quota import QuotaMiddleware
 
 REPO_ROOT = Path(__file__).parent.parent.parent.parent
 
@@ -91,7 +97,7 @@ async def global_exception_handler(request: Request, exc: Exception):
     return JSONResponse(status_code=http_exc.status_code, content={"error": {"detail": http_exc.detail}})
 
 
-def translate_exception(exc: Exception) -> Union[HTTPException, RequestValidationError]:
+def translate_exception(exc: Exception) -> HTTPException | RequestValidationError:
     if isinstance(exc, ValidationError):
         exc = RequestValidationError(exc.errors())
 
@@ -115,10 +121,12 @@ def translate_exception(exc: Exception) -> Union[HTTPException, RequestValidatio
         return HTTPException(status_code=400, detail=str(exc))
     elif isinstance(exc, PermissionError):
         return HTTPException(status_code=403, detail=f"Permission denied: {str(exc)}")
-    elif isinstance(exc, TimeoutError):
+    elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
         return HTTPException(status_code=504, detail=f"Operation timed out: {str(exc)}")
     elif isinstance(exc, NotImplementedError):
         return HTTPException(status_code=501, detail=f"Not implemented: {str(exc)}")
+    elif isinstance(exc, AuthenticationRequiredError):
+        return HTTPException(status_code=401, detail=f"Authentication required: {str(exc)}")
     else:
         return HTTPException(
             status_code=500,
@@ -140,7 +148,7 @@ async def shutdown(app):
                 await asyncio.wait_for(impl.shutdown(), timeout=5)
             else:
                 logger.warning("No shutdown method for %s", impl_name)
-        except asyncio.TimeoutError:
+        except (asyncio.TimeoutError, TimeoutError):
             logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True)
         except (Exception, asyncio.CancelledError) as e:
             logger.exception("Failed to shutdown %s: %s", impl_name, {e})
@@ -187,11 +195,31 @@ async def sse_generator(event_gen_coroutine):
         )
 
 
-def create_dynamic_typed_route(func: Any, method: str, route: str):
-    async def endpoint(request: Request, **kwargs):
+async def log_request_pre_validation(request: Request):
+    """Debug-log the raw body of POST/PUT/PATCH requests; failures only produce a warning."""
+    if request.method in ("POST", "PUT", "PATCH"):
+        try:
+            payload = await request.body()
+            if not payload:
+                logger.debug(f"Incoming {request.method} {request.url.path} request with empty body.")
+            else:
+                try:
+                    log_output = rich.pretty.pretty_repr(json.loads(payload.decode()))
+                except (json.JSONDecodeError, UnicodeDecodeError):
+                    log_output = repr(payload)
+                logger.debug(f"Incoming raw request body for {request.method} {request.url.path}:\n{log_output}")
+        except Exception as e:
+            logger.warning(f"Could not read or log request body for {request.method} {request.url.path}: {e}")
+
+
+def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
+    @functools.wraps(func)
+    async def route_handler(request: Request, **kwargs):
         # Get auth attributes from the request scope
         user_attributes = request.scope.get("user_attributes", {})
 
+        await log_request_pre_validation(request)
+
         # Use context manager with both provider data and auth attributes
         with request_provider_data_context(request.headers, user_attributes):
             is_streaming = is_streaming_request(func.__name__, request, **kwargs)
@@ -226,9 +254,9 @@ def create_dynamic_typed_route(func: Any, method: str, route: str):
             for param in new_params[1:]
         ]
 
-    endpoint.__signature__ = sig.replace(parameters=new_params)
+    route_handler.__signature__ = sig.replace(parameters=new_params)
 
-    return endpoint
+    return route_handler
 
 
 class TracingMiddleware:
@@ -250,17 +278,28 @@ class TracingMiddleware:
             logger.debug(f"Bypassing custom routing for FastAPI built-in path: {path}")
             return await self.app(scope, receive, send)
 
-        if not hasattr(self, "endpoint_impls"):
-            self.endpoint_impls = initialize_endpoint_impls(self.impls)
+        if not hasattr(self, "route_impls"):
+            self.route_impls = initialize_route_impls(self.impls)
 
         try:
-            _, _, trace_path = find_matching_endpoint(scope.get("method", "GET"), path, self.endpoint_impls)
+            _, _, trace_path = find_matching_route(scope.get("method", hdrs.METH_GET), path, self.route_impls)
         except ValueError:
             # If no matching endpoint is found, pass through to FastAPI
-            logger.debug(f"No matching endpoint found for path: {path}, falling back to FastAPI")
+            logger.debug(f"No matching route found for path: {path}, falling back to FastAPI")
             return await self.app(scope, receive, send)
 
-        trace_context = await start_trace(trace_path, {"__location__": "server", "raw_path": path})
+        trace_attributes = {"__location__": "server", "raw_path": path}
+
+        # Extract W3C trace context headers and store as trace attributes
+        headers = dict(scope.get("headers", []))
+        traceparent = headers.get(b"traceparent", b"").decode()
+        if traceparent:
+            trace_attributes["traceparent"] = traceparent
+        tracestate = headers.get(b"tracestate", b"").decode()
+        if tracestate:
+            trace_attributes["tracestate"] = tracestate
+
+        trace_context = await start_trace(trace_path, trace_attributes)
 
         async def send_with_trace_id(message):
             if message["type"] == "http.response.start":
@@ -315,7 +354,7 @@ class ClientVersionMiddleware:
         return await self.app(scope, receive, send)
 
 
-def main(args: Optional[argparse.Namespace] = None):
+def main(args: argparse.Namespace | None = None):
     """Start the LlamaStack server."""
     parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
     parser.add_argument(
@@ -338,22 +377,11 @@ def main(args: Optional[argparse.Namespace] = None):
         default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
         help="Port to listen on",
     )
-    parser.add_argument("--disable-ipv6", action="store_true", help="Whether to disable IPv6 support")
     parser.add_argument(
         "--env",
         action="append",
         help="Environment variables in KEY=value format. Can be specified multiple times.",
     )
-    parser.add_argument(
-        "--tls-keyfile",
-        help="Path to TLS key file for HTTPS",
-        required="--tls-certfile" in sys.argv,
-    )
-    parser.add_argument(
-        "--tls-certfile",
-        help="Path to TLS certificate file for HTTPS",
-        required="--tls-keyfile" in sys.argv,
-    )
 
     # Determine whether the server args are being passed by the "run" command, if this is the case
     # the args will be passed as a Namespace object to the main function, otherwise they will be
@@ -361,14 +389,6 @@ def main(args: Optional[argparse.Namespace] = None):
     if args is None:
         args = parser.parse_args()
 
-    # Check for deprecated argument usage
-    if "--yaml-config" in sys.argv:
-        warnings.warn(
-            "The '--yaml-config' argument is deprecated and will be removed in a future version. Use '--config' instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
     log_line = ""
     if args.config:
         # if the user provided a config file, use it, even if template was specified
@@ -382,10 +402,10 @@ def main(args: Optional[argparse.Namespace] = None):
             raise ValueError(f"Template {args.template} does not exist")
         log_line = f"Using template {args.template} config file: {config_file}"
     else:
-        raise ValueError("Either --yaml-config or --template must be provided")
+        raise ValueError("Either --config or --template must be provided")
 
     logger_config = None
-    with open(config_file, "r") as fp:
+    with open(config_file) as fp:
         config_contents = yaml.safe_load(fp)
         if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
             logger_config = LoggingConfig(**cfg)
@@ -419,9 +439,49 @@ def main(args: Optional[argparse.Namespace] = None):
         app.add_middleware(ClientVersionMiddleware)
 
     # Add authentication middleware if configured
-    if config.server.auth and config.server.auth.endpoint:
-        logger.info(f"Enabling authentication with endpoint: {config.server.auth.endpoint}")
-        app.add_middleware(AuthenticationMiddleware, auth_endpoint=config.server.auth.endpoint)
+    if config.server.auth:
+        logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}")
+        app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
+    else:
+        if config.server.quota:
+            quota = config.server.quota
+            logger.warning(
+                "Configured authenticated_max_requests (%d) but no auth is enabled; "
+                "falling back to anonymous_max_requests (%d) for all the requests",
+                quota.authenticated_max_requests,
+                quota.anonymous_max_requests,
+            )
+
+    if config.server.quota:
+        logger.info("Enabling quota middleware for authenticated and anonymous clients")
+
+        quota = config.server.quota
+        anonymous_max_requests = quota.anonymous_max_requests
+        # if auth is disabled, use the anonymous max requests
+        authenticated_max_requests = quota.authenticated_max_requests if config.server.auth else anonymous_max_requests
+
+        kv_config = quota.kvstore
+        window_map = {"day": 86400}
+        window_seconds = window_map[quota.period.value]
+
+        app.add_middleware(
+            QuotaMiddleware,
+            kv_config=kv_config,
+            anonymous_max_requests=anonymous_max_requests,
+            authenticated_max_requests=authenticated_max_requests,
+            window_seconds=window_seconds,
+        )
+
+    # --- CORS middleware for local development ---
+    # TODO: move to reverse proxy
+    ui_port = os.environ.get("LLAMA_STACK_UI_PORT", 8322)
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=[f"http://localhost:{ui_port}"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
 
     try:
         impls = asyncio.run(construct_stack(config))
@@ -434,7 +494,7 @@ def main(args: Optional[argparse.Namespace] = None):
     else:
         setup_logger(TelemetryAdapter(TelemetryConfig(), {}))
 
-    all_endpoints = get_all_api_endpoints()
+    all_routes = get_all_api_routes()
 
     if config.apis:
         apis_to_serve = set(config.apis)
@@ -452,24 +512,29 @@ def main(args: Optional[argparse.Namespace] = None):
     for api_str in apis_to_serve:
         api = Api(api_str)
 
-        endpoints = all_endpoints[api]
+        routes = all_routes[api]
         impl = impls[api]
 
-        for endpoint in endpoints:
-            if not hasattr(impl, endpoint.name):
+        for route in routes:
+            if not hasattr(impl, route.name):
                 # ideally this should be a typing violation already
-                raise ValueError(f"Could not find method {endpoint.name} on {impl}!!")
+                raise ValueError(f"Could not find method {route.name} on {impl}!")
 
-            impl_method = getattr(impl, endpoint.name)
-            logger.debug(f"{endpoint.method.upper()} {endpoint.route}")
+            impl_method = getattr(impl, route.name)
+            # Filter out HEAD method since it's automatically handled by FastAPI for GET routes
+            available_methods = [m for m in route.methods if m != "HEAD"]
+            if not available_methods:
+                raise ValueError(f"No methods found for {route.name} on {impl}")
+            method = available_methods[0]
+            logger.debug(f"{method} {route.path}")
 
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore", category=UserWarning, module="pydantic._internal._fields")
-                getattr(app, endpoint.method)(endpoint.route, response_model=None)(
+                getattr(app, method.lower())(route.path, response_model=None)(
                     create_dynamic_typed_route(
                         impl_method,
-                        endpoint.method,
-                        endpoint.route,
+                        method.lower(),
+                        route.path,
                     )
                 )
 
@@ -487,21 +552,24 @@ def main(args: Optional[argparse.Namespace] = None):
     port = args.port or config.server.port
 
     ssl_config = None
-    if args.tls_keyfile:
-        keyfile = args.tls_keyfile
-        certfile = args.tls_certfile
-    else:
-        keyfile = config.server.tls_keyfile
-        certfile = config.server.tls_certfile
+    keyfile = config.server.tls_keyfile
+    certfile = config.server.tls_certfile
 
     if keyfile and certfile:
         ssl_config = {
             "ssl_keyfile": keyfile,
             "ssl_certfile": certfile,
         }
-        logger.info(f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}")
+        if config.server.tls_cafile:
+            ssl_config["ssl_ca_certs"] = config.server.tls_cafile
+            ssl_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
+            logger.info(
+                f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}\n  CA: {config.server.tls_cafile}"
+            )
+        else:
+            logger.info(f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}")
 
-    listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0"
+    listen_host = config.server.host or ["::", "0.0.0.0"]
     logger.info(f"Listening on {listen_host}:{port}")
 
     uvicorn_config = {
@@ -517,7 +585,7 @@ def main(args: Optional[argparse.Namespace] = None):
     uvicorn.run(**uvicorn_config)
 
 
-def extract_path_params(route: str) -> List[str]:
+def extract_path_params(route: str) -> list[str]:
     segments = route.split("/")
     params = [seg[1:-1] for seg in segments if seg.startswith("{") and seg.endswith("}")]
     # to handle path params like {param:path}
diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index a6dc3d2a0..fc68dc016 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -8,7 +8,7 @@ import importlib.resources
 import os
 import re
 import tempfile
-from typing import Any, Dict, Optional
+from typing import Any
 
 import yaml
 
@@ -90,7 +90,7 @@ RESOURCES = [
 ]
 
 
-async def register_resources(run_config: StackRunConfig, impls: Dict[Api, Any]):
+async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
     for rsrc, api, register_method, list_method in RESOURCES:
         objects = getattr(run_config, rsrc)
         if api not in impls:
@@ -197,7 +197,7 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
         ) from e
 
 
-def add_internal_implementations(impls: Dict[Api, Any], run_config: StackRunConfig) -> None:
+def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConfig) -> None:
     """Add internal implementations (inspect and providers) to the implementations dictionary.
 
     Args:
@@ -220,8 +220,8 @@ def add_internal_implementations(impls: Dict[Api, Any], run_config: StackRunConf
 # Produces a stack of providers for the given run config. Not all APIs may be
 # asked for in the run config.
 async def construct_stack(
-    run_config: StackRunConfig, provider_registry: Optional[ProviderRegistry] = None
-) -> Dict[Api, Any]:
+    run_config: StackRunConfig, provider_registry: ProviderRegistry | None = None
+) -> dict[Api, Any]:
     dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name)
     impls = await resolve_impls(run_config, provider_registry or get_provider_registry(run_config), dist_registry)
 
@@ -244,7 +244,7 @@ def get_stack_run_config_from_template(template: str) -> StackRunConfig:
 
 
 def run_config_from_adhoc_config_spec(
-    adhoc_config_spec: str, provider_registry: Optional[ProviderRegistry] = None
+    adhoc_config_spec: str, provider_registry: ProviderRegistry | None = None
 ) -> StackRunConfig:
     """
     Create an adhoc distribution from a list of API providers.
diff --git a/llama_stack/distribution/start_stack.sh b/llama_stack/distribution/start_stack.sh
index d3e13c7dc..996935a5e 100755
--- a/llama_stack/distribution/start_stack.sh
+++ b/llama_stack/distribution/start_stack.sh
@@ -29,7 +29,7 @@ error_handler() {
 trap 'error_handler ${LINENO}' ERR
 
 if [ $# -lt 3 ]; then
-  echo "Usage: $0     "
+  echo "Usage: $0 <env_type> <env_path_or_name> <port> [--config <yaml_config>] [--env KEY=VALUE]..."
   exit 1
 fi
 
@@ -40,37 +40,51 @@ env_path_or_name="$1"
 container_image="localhost/$env_path_or_name"
 shift
 
-yaml_config="$1"
-shift
-
 port="$1"
 shift
 
 SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
 source "$SCRIPT_DIR/common.sh"
 
-# Initialize env_vars as an string
+# Initialize variables
+yaml_config=""
 env_vars=""
 other_args=""
-# Process environment variables from --env arguments
+
+# Process remaining arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
-  --env)
-
-    if [[ -n "$2" ]]; then
-      env_vars="$env_vars --env $2"
-      shift 2
-    else
-      echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
-      exit 1
-    fi
-    ;;
-  *)
-    other_args="$other_args $1"
-    shift
-    ;;
+    --config)
+      if [[ -n "$2" ]]; then
+        yaml_config="$2"
+        shift 2
+      else
+        echo -e "${RED}Error: $1 requires a CONFIG argument${NC}" >&2
+        exit 1
+      fi
+      ;;
+    --env)
+      if [[ -n "$2" ]]; then
+        env_vars="$env_vars --env $2"
+        shift 2
+      else
+        echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
+        exit 1
+      fi
+      ;;
+    *)
+      other_args="$other_args $1"
+      shift
+      ;;
   esac
 done
+
+# Check if yaml_config is required based on env_type
+if [[ "$env_type" == "venv" || "$env_type" == "conda" ]] && [ -z "$yaml_config" ]; then
+  echo -e "${RED}Error: --config is required for venv and conda environments${NC}" >&2
+  exit 1
+fi
+
 PYTHON_BINARY="python"
 case "$env_type" in
   "venv")
@@ -106,8 +120,14 @@ esac
 if [[ "$env_type" == "venv" || "$env_type" == "conda" ]]; then
     set -x
 
+    # --config is mandatory for venv/conda environments (validated right after
+    # argument parsing), so yaml_config is expected to be non-empty here.
+    # ${yaml_config:+...} still expands to nothing if it is empty, and the inner
+    # quotes keep the path a single word so configs stored under a directory
+    # containing spaces are passed through intact.
+
     $PYTHON_BINARY -m llama_stack.distribution.server.server \
-    --yaml-config "$yaml_config" \
+    ${yaml_config:+--config "$yaml_config"} \
     --port "$port" \
     $env_vars \
     $other_args
@@ -149,15 +169,19 @@ elif [[ "$env_type" == "container" ]]; then
         version_tag=$(curl -s $URL | jq -r '.info.version')
     fi
 
+    # Mount the user-supplied config over /app/run.yaml only when --config was
+    # given; otherwise the server is pointed at /app/run.yaml with no mount.
+    # The -v mount is a container-runtime option, so it must come before the
+    # image name: anything after the image is passed to the python entrypoint.
     $CONTAINER_BINARY run $CONTAINER_OPTS -it \
     -p $port:$port \
     $env_vars \
-    -v "$yaml_config:/app/config.yaml" \
+    ${yaml_config:+-v "$yaml_config:/app/run.yaml"} \
     $mounts \
     --env LLAMA_STACK_PORT=$port \
     --entrypoint python \
     $container_image:$version_tag \
     -m llama_stack.distribution.server.server \
-    --yaml-config /app/config.yaml \
+    --config /app/run.yaml \
     $other_args
 fi
diff --git a/llama_stack/distribution/store/registry.py b/llama_stack/distribution/store/registry.py
index 76b66cc7a..0e84854c2 100644
--- a/llama_stack/distribution/store/registry.py
+++ b/llama_stack/distribution/store/registry.py
@@ -6,7 +6,7 @@
 
 import asyncio
 from contextlib import asynccontextmanager
-from typing import Dict, List, Optional, Protocol, Tuple
+from typing import Protocol
 
 import pydantic
 
@@ -20,13 +20,13 @@ logger = get_logger(__name__, category="core")
 
 
 class DistributionRegistry(Protocol):
-    async def get_all(self) -> List[RoutableObjectWithProvider]: ...
+    async def get_all(self) -> list[RoutableObjectWithProvider]: ...
 
     async def initialize(self) -> None: ...
 
-    async def get(self, identifier: str) -> Optional[RoutableObjectWithProvider]: ...
+    async def get(self, identifier: str) -> RoutableObjectWithProvider | None: ...
 
-    def get_cached(self, identifier: str) -> Optional[RoutableObjectWithProvider]: ...
+    def get_cached(self, identifier: str) -> RoutableObjectWithProvider | None: ...
 
     async def update(self, obj: RoutableObjectWithProvider) -> RoutableObjectWithProvider: ...
 
@@ -36,17 +36,17 @@ class DistributionRegistry(Protocol):
 
 
 REGISTER_PREFIX = "distributions:registry"
-KEY_VERSION = "v8"
+KEY_VERSION = "v9"
 KEY_FORMAT = f"{REGISTER_PREFIX}:{KEY_VERSION}::" + "{type}:{identifier}"
 
 
-def _get_registry_key_range() -> Tuple[str, str]:
+def _get_registry_key_range() -> tuple[str, str]:
     """Returns the start and end keys for the registry range query."""
     start_key = f"{REGISTER_PREFIX}:{KEY_VERSION}"
     return start_key, f"{start_key}\xff"
 
 
-def _parse_registry_values(values: List[str]) -> List[RoutableObjectWithProvider]:
+def _parse_registry_values(values: list[str]) -> list[RoutableObjectWithProvider]:
     """Utility function to parse registry values into RoutableObjectWithProvider objects."""
     all_objects = []
     for value in values:
@@ -67,16 +67,16 @@ class DiskDistributionRegistry(DistributionRegistry):
     async def initialize(self) -> None:
         pass
 
-    def get_cached(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
+    def get_cached(self, type: str, identifier: str) -> RoutableObjectWithProvider | None:
         # Disk registry does not have a cache
         raise NotImplementedError("Disk registry does not have a cache")
 
-    async def get_all(self) -> List[RoutableObjectWithProvider]:
+    async def get_all(self) -> list[RoutableObjectWithProvider]:
         start_key, end_key = _get_registry_key_range()
-        values = await self.kvstore.range(start_key, end_key)
+        values = await self.kvstore.values_in_range(start_key, end_key)
         return _parse_registry_values(values)
 
-    async def get(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
+    async def get(self, type: str, identifier: str) -> RoutableObjectWithProvider | None:
         json_str = await self.kvstore.get(KEY_FORMAT.format(type=type, identifier=identifier))
         if not json_str:
             return None
@@ -113,7 +113,7 @@ class DiskDistributionRegistry(DistributionRegistry):
 class CachedDiskDistributionRegistry(DiskDistributionRegistry):
     def __init__(self, kvstore: KVStore):
         super().__init__(kvstore)
-        self.cache: Dict[Tuple[str, str], RoutableObjectWithProvider] = {}
+        self.cache: dict[tuple[str, str], RoutableObjectWithProvider] = {}
         self._initialized = False
         self._initialize_lock = asyncio.Lock()
         self._cache_lock = asyncio.Lock()
@@ -134,7 +134,7 @@ class CachedDiskDistributionRegistry(DiskDistributionRegistry):
                 return
 
             start_key, end_key = _get_registry_key_range()
-            values = await self.kvstore.range(start_key, end_key)
+            values = await self.kvstore.values_in_range(start_key, end_key)
             objects = _parse_registry_values(values)
 
             async with self._locked_cache() as cache:
@@ -147,15 +147,15 @@ class CachedDiskDistributionRegistry(DiskDistributionRegistry):
     async def initialize(self) -> None:
         await self._ensure_initialized()
 
-    def get_cached(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
+    def get_cached(self, type: str, identifier: str) -> RoutableObjectWithProvider | None:
         return self.cache.get((type, identifier), None)
 
-    async def get_all(self) -> List[RoutableObjectWithProvider]:
+    async def get_all(self) -> list[RoutableObjectWithProvider]:
         await self._ensure_initialized()
         async with self._locked_cache() as cache:
             return list(cache.values())
 
-    async def get(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
+    async def get(self, type: str, identifier: str) -> RoutableObjectWithProvider | None:
         await self._ensure_initialized()
         cache_key = (type, identifier)
 
@@ -189,7 +189,7 @@ class CachedDiskDistributionRegistry(DiskDistributionRegistry):
 
 
 async def create_dist_registry(
-    metadata_store: Optional[KVStoreConfig],
+    metadata_store: KVStoreConfig | None,
     image_name: str,
 ) -> tuple[CachedDiskDistributionRegistry, KVStore]:
     # instantiate kvstore for storing and retrieving distribution metadata
diff --git a/llama_stack/distribution/ui/Containerfile b/llama_stack/distribution/ui/Containerfile
index 0126d1867..5d2dc933b 100644
--- a/llama_stack/distribution/ui/Containerfile
+++ b/llama_stack/distribution/ui/Containerfile
@@ -5,7 +5,8 @@ FROM python:3.12-slim
 WORKDIR /app
 COPY . /app/
 RUN /usr/local/bin/python -m pip install --upgrade pip && \
-    /usr/local/bin/pip3 install -r requirements.txt
+    /usr/local/bin/pip3 install -r requirements.txt && \
+    /usr/local/bin/pip3 install -r llama_stack/distribution/ui/requirements.txt
 EXPOSE 8501
 
-ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ENTRYPOINT ["streamlit", "run", "llama_stack/distribution/ui/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md
index 51c2d2bc2..0e96690ec 100644
--- a/llama_stack/distribution/ui/README.md
+++ b/llama_stack/distribution/ui/README.md
@@ -48,3 +48,6 @@ uv run --with ".[ui]" streamlit run llama_stack/distribution/ui/app.py
 | TOGETHER_API_KEY           | API key for Together provider      | (empty string)            |
 | SAMBANOVA_API_KEY          | API key for SambaNova provider     | (empty string)            |
 | OPENAI_API_KEY             | API key for OpenAI provider        | (empty string)            |
+| KEYCLOAK_URL               | URL for Keycloak authentication    | (empty string)            |
+| KEYCLOAK_REALM             | Keycloak realm                     | default                   |
+| KEYCLOAK_CLIENT_ID         | Client ID for Keycloak auth        | (empty string)            |
\ No newline at end of file
diff --git a/llama_stack/distribution/ui/app.py b/llama_stack/distribution/ui/app.py
index 441f65d20..a9a28b445 100644
--- a/llama_stack/distribution/ui/app.py
+++ b/llama_stack/distribution/ui/app.py
@@ -50,6 +50,56 @@ def main():
     )
     pg.run()
 
+
+def main2():
+    """Show a welcome banner and the Keycloak login details of the signed-in user.
+
+    Relies on the module-level ``keycloak`` object, which is only assigned in the
+    ``__main__`` block when KEYCLOAK_URL and KEYCLOAK_CLIENT_ID are set.
+    """
+    from dataclasses import asdict
+
+    st.subheader(f"Welcome {keycloak.user_info['preferred_username']}!")
+    st.write("Here is your user information:")
+    st.write(asdict(keycloak))
+
+
+def get_access_token() -> str | None:
+    """Return the access token stored in the Streamlit session state, or None if unset."""
+    return st.session_state.get("access_token")
+
 
 if __name__ == "__main__":
-    main()
+    import os
+
+    keycloak_url = os.environ.get("KEYCLOAK_URL")
+    keycloak_realm = os.environ.get("KEYCLOAK_REALM", "default")
+    keycloak_client_id = os.environ.get("KEYCLOAK_CLIENT_ID")
+
+    if keycloak_url and keycloak_client_id:
+        # Imported lazily so the UI can start without streamlit_keycloak installed
+        # when Keycloak authentication is not configured.
+        from streamlit_keycloak import login
+
+        keycloak = login(
+            url=keycloak_url,
+            realm=keycloak_realm,
+            client_id=keycloak_client_id,
+            custom_labels={
+                "labelButton": "Sign in to kvant",
+                "labelLogin": "Please sign in to your kvant account.",
+                "errorNoPopup": "Unable to open the authentication popup. Allow popups and refresh the page to proceed.",
+                "errorPopupClosed": "Authentication popup was closed manually.",
+                "errorFatal": "Unable to connect to Keycloak using the current configuration.",
+            },
+            auto_refresh=True,
+        )
+
+        # Render the app only once login has succeeded; the token is kept in the
+        # session state so get_access_token() can return it.
+        if keycloak.authenticated:
+            st.session_state["access_token"] = keycloak.access_token
+            main()
+    # TBD - add other authentications
+    else:
+        main()
diff --git a/llama_stack/distribution/ui/modules/api.py b/llama_stack/distribution/ui/modules/api.py
index d5395c5b9..a426e59ba 100644
--- a/llama_stack/distribution/ui/modules/api.py
+++ b/llama_stack/distribution/ui/modules/api.py
@@ -5,14 +5,15 @@
 # the root directory of this source tree.
 
 import os
-from typing import Optional
 
 from llama_stack_client import LlamaStackClient
+from llama_stack.distribution.ui.app import get_access_token
 
 
 class LlamaStackApi:
     def __init__(self):
         self.client = LlamaStackClient(
+            api_key=get_access_token(),
             base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"),
             provider_data={
                 "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
@@ -23,11 +24,9 @@ class LlamaStackApi:
             },
         )
 
-    def run_scoring(self, row, scoring_function_ids: list[str], scoring_params: Optional[dict]):
+    def run_scoring(self, row, scoring_function_ids: list[str], scoring_params: dict | None):
         """Run scoring on a single row"""
         if not scoring_params:
             scoring_params = {fn_id: None for fn_id in scoring_function_ids}
         return self.client.scoring.score(input_rows=[row], scoring_functions=scoring_params)
 
-
-llama_stack_api = LlamaStackApi()
diff --git a/llama_stack/distribution/ui/page/distribution/datasets.py b/llama_stack/distribution/ui/page/distribution/datasets.py
index 6842b29a7..89f645ca8 100644
--- a/llama_stack/distribution/ui/page/distribution/datasets.py
+++ b/llama_stack/distribution/ui/page/distribution/datasets.py
@@ -6,13 +6,13 @@
 
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 def datasets():
     st.header("Datasets")
 
-    datasets_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.datasets.list()}
+    datasets_info = {d.identifier: d.to_dict() for d in LlamaStackApi().client.datasets.list()}
     if len(datasets_info) > 0:
         selected_dataset = st.selectbox("Select a dataset", list(datasets_info.keys()))
         st.json(datasets_info[selected_dataset], expanded=True)
diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py
index 492be4700..2b70f9202 100644
--- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py
+++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py
@@ -6,14 +6,14 @@
 
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 def benchmarks():
     # Benchmarks Section
     st.header("Benchmarks")
 
-    benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}
+    benchmarks_info = {d.identifier: d.to_dict() for d in LlamaStackApi().client.benchmarks.list()}
 
     if len(benchmarks_info) > 0:
         selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect")
diff --git a/llama_stack/distribution/ui/page/distribution/models.py b/llama_stack/distribution/ui/page/distribution/models.py
index f29459098..3b96f179f 100644
--- a/llama_stack/distribution/ui/page/distribution/models.py
+++ b/llama_stack/distribution/ui/page/distribution/models.py
@@ -6,13 +6,13 @@
 
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 def models():
     # Models Section
     st.header("Models")
-    models_info = {m.identifier: m.to_dict() for m in llama_stack_api.client.models.list()}
+    models_info = {m.identifier: m.to_dict() for m in LlamaStackApi().client.models.list()}
 
     selected_model = st.selectbox("Select a model", list(models_info.keys()))
     st.json(models_info[selected_model])
diff --git a/llama_stack/distribution/ui/page/distribution/providers.py b/llama_stack/distribution/ui/page/distribution/providers.py
index c660cb986..116237b13 100644
--- a/llama_stack/distribution/ui/page/distribution/providers.py
+++ b/llama_stack/distribution/ui/page/distribution/providers.py
@@ -6,12 +6,12 @@
 
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 def providers():
     st.header("🔍 API Providers")
-    apis_providers_lst = llama_stack_api.client.providers.list()
+    apis_providers_lst = LlamaStackApi().client.providers.list()
     api_to_providers = {}
     for api_provider in apis_providers_lst:
         if api_provider.api in api_to_providers:
diff --git a/llama_stack/distribution/ui/page/distribution/scoring_functions.py b/llama_stack/distribution/ui/page/distribution/scoring_functions.py
index 193146356..3c3428f44 100644
--- a/llama_stack/distribution/ui/page/distribution/scoring_functions.py
+++ b/llama_stack/distribution/ui/page/distribution/scoring_functions.py
@@ -6,13 +6,13 @@
 
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 def scoring_functions():
     st.header("Scoring Functions")
 
-    scoring_functions_info = {s.identifier: s.to_dict() for s in llama_stack_api.client.scoring_functions.list()}
+    scoring_functions_info = {s.identifier: s.to_dict() for s in LlamaStackApi().client.scoring_functions.list()}
 
     selected_scoring_function = st.selectbox("Select a scoring function", list(scoring_functions_info.keys()))
     st.json(scoring_functions_info[selected_scoring_function], expanded=True)
diff --git a/llama_stack/distribution/ui/page/distribution/shields.py b/llama_stack/distribution/ui/page/distribution/shields.py
index 67d66d64f..84b583980 100644
--- a/llama_stack/distribution/ui/page/distribution/shields.py
+++ b/llama_stack/distribution/ui/page/distribution/shields.py
@@ -6,14 +6,14 @@
 
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 def shields():
     # Shields Section
     st.header("Shields")
 
-    shields_info = {s.identifier: s.to_dict() for s in llama_stack_api.client.shields.list()}
+    shields_info = {s.identifier: s.to_dict() for s in LlamaStackApi().client.shields.list()}
 
     selected_shield = st.selectbox("Select a shield", list(shields_info.keys()))
     st.json(shields_info[selected_shield])
diff --git a/llama_stack/distribution/ui/page/distribution/vector_dbs.py b/llama_stack/distribution/ui/page/distribution/vector_dbs.py
index 49a4f25bb..e7eb7b13b 100644
--- a/llama_stack/distribution/ui/page/distribution/vector_dbs.py
+++ b/llama_stack/distribution/ui/page/distribution/vector_dbs.py
@@ -6,12 +6,12 @@
 
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 def vector_dbs():
     st.header("Vector Databases")
-    vector_dbs_info = {v.identifier: v.to_dict() for v in llama_stack_api.client.vector_dbs.list()}
+    vector_dbs_info = {v.identifier: v.to_dict() for v in LlamaStackApi().client.vector_dbs.list()}
 
     if len(vector_dbs_info) > 0:
         selected_vector_db = st.selectbox("Select a vector database", list(vector_dbs_info.keys()))
diff --git a/llama_stack/distribution/ui/page/evaluations/app_eval.py b/llama_stack/distribution/ui/page/evaluations/app_eval.py
index d7bc6388c..13da6071e 100644
--- a/llama_stack/distribution/ui/page/evaluations/app_eval.py
+++ b/llama_stack/distribution/ui/page/evaluations/app_eval.py
@@ -9,7 +9,7 @@ import json
 import pandas as pd
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 from llama_stack.distribution.ui.modules.utils import process_dataset
 
 
@@ -39,7 +39,7 @@ def application_evaluation_page():
 
     # Select Scoring Functions to Run Evaluation On
     st.subheader("Select Scoring Functions")
-    scoring_functions = llama_stack_api.client.scoring_functions.list()
+    scoring_functions = LlamaStackApi().client.scoring_functions.list()
     scoring_functions = {sf.identifier: sf for sf in scoring_functions}
     scoring_functions_names = list(scoring_functions.keys())
     selected_scoring_functions = st.multiselect(
@@ -48,7 +48,7 @@ def application_evaluation_page():
         help="Choose one or more scoring functions.",
     )
 
-    available_models = llama_stack_api.client.models.list()
+    available_models = LlamaStackApi().client.models.list()
     available_models = [m.identifier for m in available_models]
 
     scoring_params = {}
@@ -108,7 +108,7 @@ def application_evaluation_page():
                 progress_bar.progress(progress, text=progress_text)
 
                 # Run evaluation for current row
-                score_res = llama_stack_api.run_scoring(
+                score_res = LlamaStackApi().run_scoring(
                     r,
                     scoring_function_ids=selected_scoring_functions,
                     scoring_params=scoring_params,
diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py
index 97f875e17..133c3b151 100644
--- a/llama_stack/distribution/ui/page/evaluations/native_eval.py
+++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py
@@ -9,13 +9,13 @@ import json
 import pandas as pd
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 def select_benchmark_1():
     # Select Benchmarks
     st.subheader("1. Choose An Eval Task")
-    benchmarks = llama_stack_api.client.benchmarks.list()
+    benchmarks = LlamaStackApi().client.benchmarks.list()
     benchmarks = {et.identifier: et for et in benchmarks}
     benchmarks_names = list(benchmarks.keys())
     selected_benchmark = st.selectbox(
@@ -47,7 +47,7 @@ def define_eval_candidate_2():
         # Define Eval Candidate
         candidate_type = st.radio("Candidate Type", ["model", "agent"])
 
-        available_models = llama_stack_api.client.models.list()
+        available_models = LlamaStackApi().client.models.list()
         available_models = [model.identifier for model in available_models]
         selected_model = st.selectbox(
             "Choose a model",
@@ -167,7 +167,7 @@ def run_evaluation_3():
     eval_candidate = st.session_state["eval_candidate"]
 
     dataset_id = benchmarks[selected_benchmark].dataset_id
-    rows = llama_stack_api.client.datasets.iterrows(
+    rows = LlamaStackApi().client.datasets.iterrows(
         dataset_id=dataset_id,
     )
     total_rows = len(rows.data)
@@ -208,7 +208,7 @@ def run_evaluation_3():
             progress = i / len(rows)
             progress_bar.progress(progress, text=progress_text)
             # Run evaluation for current row
-            eval_res = llama_stack_api.client.eval.evaluate_rows(
+            eval_res = LlamaStackApi().client.eval.evaluate_rows(
                 benchmark_id=selected_benchmark,
                 input_rows=[r],
                 scoring_functions=benchmarks[selected_benchmark].scoring_functions,
diff --git a/llama_stack/distribution/ui/page/playground/chat.py b/llama_stack/distribution/ui/page/playground/chat.py
index 8e7345169..053ae42de 100644
--- a/llama_stack/distribution/ui/page/playground/chat.py
+++ b/llama_stack/distribution/ui/page/playground/chat.py
@@ -6,12 +6,12 @@
 
 import streamlit as st
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 # Sidebar configurations
 with st.sidebar:
     st.header("Configuration")
-    available_models = llama_stack_api.client.models.list()
+    available_models = LlamaStackApi().client.models.list()
     available_models = [model.identifier for model in available_models if model.model_type == "llm"]
     selected_model = st.selectbox(
         "Choose a model",
@@ -103,7 +103,7 @@ if prompt := st.chat_input("Example: What is Llama Stack?"):
         else:
             strategy = {"type": "greedy"}
 
-        response = llama_stack_api.client.inference.chat_completion(
+        response = LlamaStackApi().client.inference.chat_completion(
             messages=[
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": prompt},
@@ -124,7 +124,7 @@ if prompt := st.chat_input("Example: What is Llama Stack?"):
                 message_placeholder.markdown(full_response + "▌")
             message_placeholder.markdown(full_response)
         else:
-            full_response = response
-            message_placeholder.markdown(full_response.completion_message.content)
+            full_response = response.completion_message.content
+            message_placeholder.markdown(full_response)
 
         st.session_state.messages.append({"role": "assistant", "content": full_response})
diff --git a/llama_stack/distribution/ui/page/playground/rag.py b/llama_stack/distribution/ui/page/playground/rag.py
index 696d89bc2..94e27a255 100644
--- a/llama_stack/distribution/ui/page/playground/rag.py
+++ b/llama_stack/distribution/ui/page/playground/rag.py
@@ -10,7 +10,7 @@ import streamlit as st
 from llama_stack_client import Agent, AgentEventLogger, RAGDocument
 
 from llama_stack.apis.common.content_types import ToolCallDelta
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 from llama_stack.distribution.ui.modules.utils import data_url_from_file
 
 
@@ -57,14 +57,14 @@ def rag_chat_page():
                     for i, uploaded_file in enumerate(uploaded_files)
                 ]
 
-                providers = llama_stack_api.client.providers.list()
+                providers = LlamaStackApi().client.providers.list()
                 vector_io_provider = None
 
                 for x in providers:
                     if x.api == "vector_io":
                         vector_io_provider = x.provider_id
 
-                llama_stack_api.client.vector_dbs.register(
+                LlamaStackApi().client.vector_dbs.register(
                     vector_db_id=vector_db_name,  # Use the user-provided name
                     embedding_dimension=384,
                     embedding_model="all-MiniLM-L6-v2",
@@ -72,7 +72,7 @@ def rag_chat_page():
                 )
 
                 # insert documents using the custom vector db name
-                llama_stack_api.client.tool_runtime.rag_tool.insert(
+                LlamaStackApi().client.tool_runtime.rag_tool.insert(
                     vector_db_id=vector_db_name,  # Use the user-provided name
                     documents=documents,
                     chunk_size_in_tokens=512,
@@ -93,7 +93,7 @@ def rag_chat_page():
         )
 
         # select memory banks
-        vector_dbs = llama_stack_api.client.vector_dbs.list()
+        vector_dbs = LlamaStackApi().client.vector_dbs.list()
         vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
         selected_vector_dbs = st.multiselect(
             label="Select Document Collections to use in RAG queries",
@@ -103,7 +103,7 @@ def rag_chat_page():
         )
 
         st.subheader("Inference Parameters", divider=True)
-        available_models = llama_stack_api.client.models.list()
+        available_models = LlamaStackApi().client.models.list()
         available_models = [model.identifier for model in available_models if model.model_type == "llm"]
         selected_model = st.selectbox(
             label="Choose a model",
@@ -167,7 +167,7 @@ def rag_chat_page():
     @st.cache_resource
     def create_agent():
         return Agent(
-            llama_stack_api.client,
+            LlamaStackApi().client,
             model=selected_model,
             instructions=system_prompt,
             sampling_params={
@@ -232,7 +232,7 @@ def rag_chat_page():
             st.session_state.messages.append({"role": "system", "content": system_prompt})
 
         # Query the vector DB
-        rag_response = llama_stack_api.client.tool_runtime.rag_tool.query(
+        rag_response = LlamaStackApi().client.tool_runtime.rag_tool.query(
             content=prompt, vector_db_ids=list(selected_vector_dbs)
         )
         prompt_context = rag_response.content
@@ -251,7 +251,7 @@ def rag_chat_page():
 
             # Run inference directly
             st.session_state.messages.append({"role": "user", "content": extended_prompt})
-            response = llama_stack_api.client.inference.chat_completion(
+            response = LlamaStackApi().client.inference.chat_completion(
                 messages=st.session_state.messages,
                 model_id=selected_model,
                 sampling_params={
diff --git a/llama_stack/distribution/ui/page/playground/tools.py b/llama_stack/distribution/ui/page/playground/tools.py
index 6c6a9fcfd..570bfb366 100644
--- a/llama_stack/distribution/ui/page/playground/tools.py
+++ b/llama_stack/distribution/ui/page/playground/tools.py
@@ -13,7 +13,7 @@ from llama_stack_client import Agent
 from llama_stack_client.lib.agents.react.agent import ReActAgent
 from llama_stack_client.lib.agents.react.tool_parser import ReActOutput
 
-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.distribution.ui.modules.api import LlamaStackApi
 
 
 class AgentType(enum.Enum):
@@ -24,7 +24,7 @@ class AgentType(enum.Enum):
 def tool_chat_page():
     st.title("🛠 Tools")
 
-    client = llama_stack_api.client
+    client = LlamaStackApi().client
     models = client.models.list()
     model_list = [model.identifier for model in models if model.api_model_type == "llm"]
 
@@ -55,7 +55,7 @@ def tool_chat_page():
         )
 
         if "builtin::rag" in toolgroup_selection:
-            vector_dbs = llama_stack_api.client.vector_dbs.list() or []
+            vector_dbs = LlamaStackApi().client.vector_dbs.list() or []
             if not vector_dbs:
                 st.info("No vector databases available for selection.")
             vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
@@ -94,12 +94,16 @@ def tool_chat_page():
         st.subheader("Agent Configurations")
         st.subheader("Agent Type")
         agent_type = st.radio(
-            "Select Agent Type",
-            [AgentType.REGULAR, AgentType.REACT],
-            format_func=lambda x: x.value,
+            label="Select Agent Type",
+            options=["Regular", "ReAct"],
             on_change=reset_agent,
         )
 
+        if agent_type == "ReAct":
+            agent_type = AgentType.REACT
+        else:
+            agent_type = AgentType.REGULAR
+
         max_tokens = st.slider(
             "Max Tokens",
             min_value=0,
diff --git a/llama_stack/distribution/ui/requirements.txt b/llama_stack/distribution/ui/requirements.txt
index 61d42768d..862f969d6 100644
--- a/llama_stack/distribution/ui/requirements.txt
+++ b/llama_stack/distribution/ui/requirements.txt
@@ -1,5 +1,5 @@
-streamlit
+llama-stack-client>=0.2.9
 pandas
-llama-stack-client>=0.2.1
+streamlit
 streamlit-option-menu
-llama-stack>=0.2.1
+streamlit-keycloak
diff --git a/llama_stack/distribution/utils/config.py b/llama_stack/distribution/utils/config.py
index 5e78289b7..dece52460 100644
--- a/llama_stack/distribution/utils/config.py
+++ b/llama_stack/distribution/utils/config.py
@@ -4,10 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 
-def redact_sensitive_fields(data: Dict[str, Any]) -> Dict[str, Any]:
+def redact_sensitive_fields(data: dict[str, Any]) -> dict[str, Any]:
     """Redact sensitive information from config before printing."""
     sensitive_patterns = ["api_key", "api_token", "password", "secret"]
 
@@ -18,7 +18,7 @@ def redact_sensitive_fields(data: Dict[str, Any]) -> Dict[str, Any]:
             return [_redact_value(i) for i in v]
         return v
 
-    def _redact_dict(d: Dict[str, Any]) -> Dict[str, Any]:
+    def _redact_dict(d: dict[str, Any]) -> dict[str, Any]:
         result = {}
         for k, v in d.items():
             if any(pattern in k.lower() for pattern in sensitive_patterns):
diff --git a/llama_stack/distribution/utils/config_dirs.py b/llama_stack/distribution/utils/config_dirs.py
index 9b9a7ceb3..c3e520f28 100644
--- a/llama_stack/distribution/utils/config_dirs.py
+++ b/llama_stack/distribution/utils/config_dirs.py
@@ -14,3 +14,5 @@ DISTRIBS_BASE_DIR = LLAMA_STACK_CONFIG_DIR / "distributions"
 DEFAULT_CHECKPOINT_DIR = LLAMA_STACK_CONFIG_DIR / "checkpoints"
 
 RUNTIME_BASE_DIR = LLAMA_STACK_CONFIG_DIR / "runtime"
+
+EXTERNAL_PROVIDERS_DIR = LLAMA_STACK_CONFIG_DIR / "providers.d"
diff --git a/llama_stack/distribution/utils/context.py b/llama_stack/distribution/utils/context.py
index c34079ac6..3fcd3315f 100644
--- a/llama_stack/distribution/utils/context.py
+++ b/llama_stack/distribution/utils/context.py
@@ -4,14 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from collections.abc import AsyncGenerator
 from contextvars import ContextVar
-from typing import AsyncGenerator, List, TypeVar
+from typing import TypeVar
 
 T = TypeVar("T")
 
 
 def preserve_contexts_async_generator(
-    gen: AsyncGenerator[T, None], context_vars: List[ContextVar]
+    gen: AsyncGenerator[T, None], context_vars: list[ContextVar]
 ) -> AsyncGenerator[T, None]:
     """
     Wraps an async generator to preserve context variables across iterations.
diff --git a/llama_stack/distribution/utils/exec.py b/llama_stack/distribution/utils/exec.py
index 3bf3c81ce..7c2e00524 100644
--- a/llama_stack/distribution/utils/exec.py
+++ b/llama_stack/distribution/utils/exec.py
@@ -8,6 +8,7 @@ import logging
 import os
 import signal
 import subprocess
+import sys
 
 from termcolor import cprint
 
@@ -22,8 +23,10 @@ from llama_stack.distribution.utils.image_types import LlamaStackImageType
 
 def formulate_run_args(image_type, image_name, config, template_name) -> list:
     env_name = ""
-    if image_type == LlamaStackImageType.CONTAINER.value or config.container_image:
-        env_name = f"distribution-{template_name}" if template_name else config.container_image
+    if image_type == LlamaStackImageType.CONTAINER.value:
+        env_name = (
+            f"distribution-{template_name}" if template_name else (config.container_image if config else image_name)
+        )
     elif image_type == LlamaStackImageType.CONDA.value:
         current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
         env_name = image_name or current_conda_env
@@ -31,6 +34,7 @@ def formulate_run_args(image_type, image_name, config, template_name) -> list:
             cprint(
                 "No current conda environment detected, please specify a conda environment name with --image-name",
                 color="red",
+                file=sys.stderr,
             )
             return
 
@@ -47,12 +51,13 @@ def formulate_run_args(image_type, image_name, config, template_name) -> list:
                     return envpath
             return None
 
-        print(f"Using conda environment: {env_name}")
+        cprint(f"Using conda environment: {env_name}", color="green", file=sys.stderr)
         conda_prefix = get_conda_prefix(env_name)
         if not conda_prefix:
             cprint(
                 f"Conda environment {env_name} does not exist.",
                 color="red",
+                file=sys.stderr,
             )
             return
 
@@ -61,6 +66,7 @@ def formulate_run_args(image_type, image_name, config, template_name) -> list:
             cprint(
                 f"Build file {build_file} does not exist.\n\nPlease run `llama stack build` or specify the correct conda environment name with --image-name",
                 color="red",
+                file=sys.stderr,
             )
             return
     else:
@@ -71,9 +77,10 @@ def formulate_run_args(image_type, image_name, config, template_name) -> list:
             cprint(
                 "No current virtual environment detected, please specify a virtual environment name with --image-name",
                 color="red",
+                file=sys.stderr,
             )
             return
-        print(f"Using virtual environment: {env_name}")
+        cprint(f"Using virtual environment: {env_name}", file=sys.stderr)
 
     script = importlib.resources.files("llama_stack") / "distribution/start_stack.sh"
     run_args = [
diff --git a/llama_stack/distribution/utils/prompt_for_config.py b/llama_stack/distribution/utils/prompt_for_config.py
index 9b2b99022..26f6920e0 100644
--- a/llama_stack/distribution/utils/prompt_for_config.py
+++ b/llama_stack/distribution/utils/prompt_for_config.py
@@ -8,12 +8,11 @@ import inspect
 import json
 import logging
 from enum import Enum
-from typing import Any, List, Literal, Optional, Type, Union, get_args, get_origin
+from typing import Annotated, Any, Literal, Union, get_args, get_origin
 
 from pydantic import BaseModel
 from pydantic.fields import FieldInfo
 from pydantic_core import PydanticUndefinedType
-from typing_extensions import Annotated
 
 log = logging.getLogger(__name__)
 
@@ -21,7 +20,7 @@ log = logging.getLogger(__name__)
 def is_list_of_primitives(field_type):
     """Check if a field type is a List of primitive types."""
     origin = get_origin(field_type)
-    if origin is List or origin is list:
+    if origin is list:
         args = get_args(field_type)
         if len(args) == 1 and args[0] in (int, float, str, bool):
             return True
@@ -53,7 +52,7 @@ def get_non_none_type(field_type):
     return next(arg for arg in get_args(field_type) if arg is not type(None))
 
 
-def manually_validate_field(model: Type[BaseModel], field_name: str, value: Any):
+def manually_validate_field(model: type[BaseModel], field_name: str, value: Any):
     validators = model.__pydantic_decorators__.field_validators
     for _name, validator in validators.items():
         if field_name in validator.info.fields:
@@ -126,7 +125,7 @@ def prompt_for_discriminated_union(
 #
 # doesn't support List[nested_class] yet or Dicts of any kind. needs a bunch of
 # unit tests for coverage.
-def prompt_for_config(config_type: type[BaseModel], existing_config: Optional[BaseModel] = None) -> BaseModel:
+def prompt_for_config(config_type: type[BaseModel], existing_config: BaseModel | None = None) -> BaseModel:
     """
     Recursively prompt the user for configuration values based on a Pydantic BaseModel.
 
diff --git a/llama_stack/log.py b/llama_stack/log.py
index 3835b74a1..f4184710a 100644
--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@@ -6,8 +6,8 @@
 
 import logging
 import os
+import sys
 from logging.config import dictConfig
-from typing import Dict, Optional
 
 from rich.console import Console
 from rich.errors import MarkupError
@@ -33,7 +33,7 @@ CATEGORIES = [
 ]
 
 # Initialize category levels with default level
-_category_levels: Dict[str, int] = {category: DEFAULT_LOG_LEVEL for category in CATEGORIES}
+_category_levels: dict[str, int] = {category: DEFAULT_LOG_LEVEL for category in CATEGORIES}
 
 
 def config_to_category_levels(category: str, level: str):
@@ -49,7 +49,7 @@ def config_to_category_levels(category: str, level: str):
         Dict[str, int]: A dictionary mapping categories to their log levels.
     """
 
-    category_levels: Dict[str, int] = {}
+    category_levels: dict[str, int] = {}
     level_value = logging._nameToLevel.get(str(level).upper())
     if level_value is None:
         logging.warning(f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'.")
@@ -69,7 +69,7 @@ def config_to_category_levels(category: str, level: str):
     return category_levels
 
 
-def parse_yaml_config(yaml_config: LoggingConfig) -> Dict[str, int]:
+def parse_yaml_config(yaml_config: LoggingConfig) -> dict[str, int]:
     """
     Helper function to parse a yaml logging configuration found in the run.yaml
 
@@ -86,7 +86,7 @@ def parse_yaml_config(yaml_config: LoggingConfig) -> Dict[str, int]:
     return category_levels
 
 
-def parse_environment_config(env_config: str) -> Dict[str, int]:
+def parse_environment_config(env_config: str) -> dict[str, int]:
     """
     Parse the LLAMA_STACK_LOGGING environment variable and return a dictionary of category log levels.
 
@@ -131,7 +131,7 @@ class CustomRichHandler(RichHandler):
                 self.markup = original_markup
 
 
-def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None:
+def setup_logging(category_levels: dict[str, int], log_file: str | None) -> None:
     """
     Configure logging based on the provided category log levels and an optional log file.
 
@@ -211,7 +211,7 @@ def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None
 
 
 def get_logger(
-    name: str, category: str = "uncategorized", config: Optional[LoggingConfig] | None = None
+    name: str, category: str = "uncategorized", config: LoggingConfig | None = None
 ) -> logging.LoggerAdapter:
     """
     Returns a logger with the specified name and category.
@@ -235,7 +235,7 @@ def get_logger(
 
 env_config = os.environ.get("LLAMA_STACK_LOGGING", "")
 if env_config:
-    cprint(f"Environment variable LLAMA_STACK_LOGGING found: {env_config}", "yellow")
+    cprint(f"Environment variable LLAMA_STACK_LOGGING found: {env_config}", color="yellow", file=sys.stderr)
     _category_levels.update(parse_environment_config(env_config))
 
 log_file = os.environ.get("LLAMA_STACK_LOG_FILE")
diff --git a/llama_stack/models/llama/checkpoint.py b/llama_stack/models/llama/checkpoint.py
index 2bae08a69..c9e0030e3 100644
--- a/llama_stack/models/llama/checkpoint.py
+++ b/llama_stack/models/llama/checkpoint.py
@@ -7,14 +7,14 @@
 import concurrent.futures
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 import numpy as np
 import torch
 from fairscale.nn.model_parallel.initialize import get_model_parallel_rank, get_model_parallel_world_size
 
 
-def map_mp_rank(old_mp_size: int, new_mp_size: int, new_mp_rank: int) -> List[int]:
+def map_mp_rank(old_mp_size: int, new_mp_size: int, new_mp_rank: int) -> list[int]:
     """Map a new MP rank to a list of old MP ranks given a change in MP size."""
     if new_mp_size % old_mp_size == 0:
         # Read old MP shard and split it into smaller ones
@@ -31,12 +31,12 @@ def map_mp_rank(old_mp_size: int, new_mp_size: int, new_mp_rank: int) -> List[in
 
 
 def maybe_reshard_state_dict(
-    ckpt_paths: List[Path],
+    ckpt_paths: list[Path],
     n_kv_heads: int,
-    moe_num_experts: Optional[int] = None,
-    map_location: Union[str, torch.device] = "cpu",
+    moe_num_experts: int | None = None,
+    map_location: str | torch.device = "cpu",
     mmap: bool = True,
-) -> Dict[str, torch.Tensor]:
+) -> dict[str, torch.Tensor]:
     if str(map_location) == "cpu":
         torch.set_default_tensor_type(torch.BFloat16Tensor)
     else:
@@ -97,18 +97,18 @@ _MOE_WEIGHT_COLUMN_KEY = {"feed_forward.experts.moe_w_out_eF_D"}
 
 
 def reshard_mp(
-    state_dicts: List[Dict[str, torch.Tensor]],
+    state_dicts: list[dict[str, torch.Tensor]],
     size: int,
     rank: int,
     repeat_qk_qv: int = 1,
-) -> Dict[str, torch.Tensor]:
+) -> dict[str, torch.Tensor]:
     """
     Reshard a list of state dicts into a single state dict given a change in MP size.
     If the list has more than one state dict, we concatenate the values of the same
     key across all state dicts. Otherwise, we just slice it for the current MP rank.
     """
 
-    def concat_or_chunk(tensors: List[torch.Tensor], dim: int) -> torch.Tensor:
+    def concat_or_chunk(tensors: list[torch.Tensor], dim: int) -> torch.Tensor:
         if len(tensors) > 1:
             return torch.cat(tensors, dim=dim)
         return tensors[0].chunk(size, dim=dim)[rank].clone()
@@ -144,7 +144,7 @@ def reshard_mp(
     column_regex = re.compile("|".join(column_keys))
     row_regex = re.compile("|".join(row_keys))
 
-    output: Dict[str, torch.Tensor] = {}
+    output: dict[str, torch.Tensor] = {}
     with concurrent.futures.ThreadPoolExecutor() as executor:
         # Note: only processes keys in the first state dict.
         # Assumes keys are the same across all state dicts.
@@ -154,7 +154,7 @@ def reshard_mp(
     return output
 
 
-def convert_moe_weights(state_dict: Dict[str, Any], num_experts: int) -> Dict[str, Any]:
+def convert_moe_weights(state_dict: dict[str, Any], num_experts: int) -> dict[str, Any]:
     routed_keys = _MOE_WEIGHT_ROW_KEY | _MOE_WEIGHT_COLUMN_KEY
     routed_regex = re.compile("|".join(routed_keys))
     keys = list(state_dict.keys())
diff --git a/llama_stack/models/llama/datatypes.py b/llama_stack/models/llama/datatypes.py
index 48cb51005..f9f094c3d 100644
--- a/llama_stack/models/llama/datatypes.py
+++ b/llama_stack/models/llama/datatypes.py
@@ -7,10 +7,9 @@
 import base64
 from enum import Enum
 from io import BytesIO
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
-from typing_extensions import Annotated
 
 # The goal is that these set of types are relevant for all Llama models.
 # That isn't the current state yet -- e.g., BuiltinTool is somewhat specific to
@@ -31,21 +30,21 @@ class BuiltinTool(Enum):
     code_interpreter = "code_interpreter"
 
 
-Primitive = Union[str, int, float, bool, None]
-RecursiveType = Union[Primitive, List[Primitive], Dict[str, Primitive]]
+Primitive = str | int | float | bool | None
+RecursiveType = Primitive | list[Primitive] | dict[str, Primitive]
 
 
 class ToolCall(BaseModel):
     call_id: str
-    tool_name: Union[BuiltinTool, str]
+    tool_name: BuiltinTool | str
     # Plan is to deprecate the Dict in favor of a JSON string
     # that is parsed on the client side instead of trying to manage
     # the recursive type here.
     # Making this a union so that client side can start prepping for this change.
     # Eventually, we will remove both the Dict and arguments_json field,
     # and arguments will just be a str
-    arguments: Union[str, Dict[str, RecursiveType]]
-    arguments_json: Optional[str] = None
+    arguments: str | dict[str, RecursiveType]
+    arguments_json: str | None = None
 
     @field_validator("tool_name", mode="before")
     @classmethod
@@ -91,15 +90,15 @@ class StopReason(Enum):
 
 class ToolParamDefinition(BaseModel):
     param_type: str
-    description: Optional[str] = None
-    required: Optional[bool] = True
-    default: Optional[Any] = None
+    description: str | None = None
+    required: bool | None = True
+    default: Any | None = None
 
 
 class ToolDefinition(BaseModel):
-    tool_name: Union[BuiltinTool, str]
-    description: Optional[str] = None
-    parameters: Optional[Dict[str, ToolParamDefinition]] = None
+    tool_name: BuiltinTool | str
+    description: str | None = None
+    parameters: dict[str, ToolParamDefinition] | None = None
 
     @field_validator("tool_name", mode="before")
     @classmethod
@@ -119,7 +118,7 @@ class RawMediaItem(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     @field_serializer("data")
-    def serialize_data(self, data: Optional[bytes], _info):
+    def serialize_data(self, data: bytes | None, _info):
         if data is None:
             return None
         return base64.b64encode(data).decode("utf-8")
@@ -137,9 +136,9 @@ class RawTextItem(BaseModel):
     text: str
 
 
-RawContentItem = Annotated[Union[RawTextItem, RawMediaItem], Field(discriminator="type")]
+RawContentItem = Annotated[RawTextItem | RawMediaItem, Field(discriminator="type")]
 
-RawContent = str | RawContentItem | List[RawContentItem]
+RawContent = str | RawContentItem | list[RawContentItem]
 
 
 class RawMessage(BaseModel):
@@ -147,17 +146,17 @@ class RawMessage(BaseModel):
     content: RawContent
 
     # This is for RAG but likely should be absorbed into content
-    context: Optional[RawContent] = None
+    context: RawContent | None = None
 
     # These are for the output message coming from the assistant
-    stop_reason: Optional[StopReason] = None
-    tool_calls: List[ToolCall] = Field(default_factory=list)
+    stop_reason: StopReason | None = None
+    tool_calls: list[ToolCall] = Field(default_factory=list)
 
 
 class GenerationResult(BaseModel):
     token: int
     text: str
-    logprobs: Optional[List[float]] = None
+    logprobs: list[float] | None = None
 
     source: Literal["input"] | Literal["output"]
 
diff --git a/llama_stack/models/llama/llama3/args.py b/llama_stack/models/llama/llama3/args.py
index f7e4b4557..4f92874f5 100644
--- a/llama_stack/models/llama/llama3/args.py
+++ b/llama_stack/models/llama/llama3/args.py
@@ -6,7 +6,6 @@
 
 from dataclasses import dataclass
 from enum import Enum
-from typing import Optional
 
 
 class QuantizationScheme(Enum):
@@ -15,8 +14,8 @@ class QuantizationScheme(Enum):
 
 @dataclass
 class QuantizationArgs:
-    scheme: Optional[QuantizationScheme] = None
-    group_size: Optional[int] = None
+    scheme: QuantizationScheme | None = None
+    group_size: int | None = None
     spinquant: bool = False
 
     def __init__(self, **kwargs):
@@ -39,10 +38,10 @@ class ModelArgs:
     dim: int = 4096
     n_layers: int = 32
     n_heads: int = 32
-    n_kv_heads: Optional[int] = None
+    n_kv_heads: int | None = None
     vocab_size: int = -1
     multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
-    ffn_dim_multiplier: Optional[float] = None
+    ffn_dim_multiplier: float | None = None
     norm_eps: float = 1e-5
     rope_theta: float = 500000
     use_scaled_rope: bool = False
@@ -55,8 +54,8 @@ class ModelArgs:
     vision_max_num_chunks: int = 4
     vision_num_cross_attention_layers: int = -1
 
-    quantization_args: Optional[QuantizationArgs] = None
-    lora_args: Optional[LoRAArgs] = None
+    quantization_args: QuantizationArgs | None = None
+    lora_args: LoRAArgs | None = None
 
     def __init__(self, **kwargs):
         for k, v in kwargs.items():
diff --git a/llama_stack/models/llama/llama3/chat_format.py b/llama_stack/models/llama/llama3/chat_format.py
index fe7a7a898..7bb05d8db 100644
--- a/llama_stack/models/llama/llama3/chat_format.py
+++ b/llama_stack/models/llama/llama3/chat_format.py
@@ -8,7 +8,6 @@ import io
 import json
 import uuid
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
 
 from PIL import Image as PIL_Image
 
@@ -29,14 +28,14 @@ from .tool_utils import ToolUtils
 
 @dataclass
 class VisionInput:
-    mask: List[List[int]]
-    images: List[PIL_Image.Image]
+    mask: list[list[int]]
+    images: list[PIL_Image.Image]
 
 
 @dataclass
 class LLMInput:
-    tokens: List[int]
-    vision: Optional[VisionInput] = None
+    tokens: list[int]
+    vision: VisionInput | None = None
 
 
 def role_str(role: Role) -> str:
@@ -50,7 +49,7 @@ def role_str(role: Role) -> str:
 
 
 class ChatFormat:
-    possible_headers: Dict[Role, str]
+    possible_headers: dict[Role, str]
 
     def __init__(self, tokenizer: Tokenizer):
         self.tokenizer = tokenizer
@@ -58,7 +57,7 @@ class ChatFormat:
         self.possible_headers = {role: f"<|start_header_id|>{role_str(role)}<|end_header_id|>\n\n" for role in Role}
         self.vision_token = self.tokenizer.special_tokens["<|image|>"]
 
-    def _encode_header(self, role: str) -> List[int]:
+    def _encode_header(self, role: str) -> list[int]:
         tokens = []
         tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
         tokens.extend(self.tokenizer.encode("ipython" if role == "tool" else role, bos=False, eos=False))
@@ -70,7 +69,7 @@ class ChatFormat:
         tokens, images = self._encode_content(content, bos=True)
         return self._model_input_from_tokens_images(tokens, images)
 
-    def _encode_content(self, content: RawContent, bos: bool = False) -> Tuple[List[int], List[PIL_Image.Image]]:
+    def _encode_content(self, content: RawContent, bos: bool = False) -> tuple[list[int], list[PIL_Image.Image]]:
         tokens = []
         images = []
 
@@ -107,7 +106,7 @@ class ChatFormat:
 
     def encode_message(
         self, message: RawMessage, tool_prompt_format: ToolPromptFormat
-    ) -> Tuple[List[int], List[PIL_Image.Image]]:
+    ) -> tuple[list[int], list[PIL_Image.Image]]:
         tokens = self._encode_header(message.role)
         images = []
 
@@ -145,8 +144,8 @@ class ChatFormat:
 
     def encode_dialog_prompt(
         self,
-        messages: List[RawMessage],
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
+        messages: list[RawMessage],
+        tool_prompt_format: ToolPromptFormat | None = None,
     ) -> LLMInput:
         tool_prompt_format = tool_prompt_format or ToolPromptFormat.json
         tokens = []
@@ -163,7 +162,7 @@ class ChatFormat:
         return self._model_input_from_tokens_images(tokens, images)
 
     # TODO(this should be generic, not only for assistant messages)
-    def decode_assistant_message(self, tokens: List[int], stop_reason: StopReason) -> RawMessage:
+    def decode_assistant_message(self, tokens: list[int], stop_reason: StopReason) -> RawMessage:
         content = self.tokenizer.decode(tokens)
 
         return self.decode_assistant_message_from_content(content, stop_reason)
@@ -234,7 +233,7 @@ class ChatFormat:
             tool_calls=tool_calls,
         )
 
-    def _model_input_from_tokens_images(self, tokens: List[int], images: List[PIL_Image.Image]) -> LLMInput:
+    def _model_input_from_tokens_images(self, tokens: list[int], images: list[PIL_Image.Image]) -> LLMInput:
         vision_input = None
         if len(images) > 0:
             vision_input = VisionInput(
@@ -249,9 +248,9 @@ class ChatFormat:
 
 
 def create_vision_mask(
-    tokens: List[int],
+    tokens: list[int],
     vision_token: int,
-) -> List[List[int]]:
+) -> list[list[int]]:
     vision_token_locations = [i for i, token in enumerate(tokens) if token == vision_token]
     if len(vision_token_locations) == 0:
         return []
diff --git a/llama_stack/models/llama/llama3/generation.py b/llama_stack/models/llama/llama3/generation.py
index 35c140707..fe7be5ea9 100644
--- a/llama_stack/models/llama/llama3/generation.py
+++ b/llama_stack/models/llama/llama3/generation.py
@@ -15,8 +15,8 @@ import json
 import os
 import sys
 import time
+from collections.abc import Callable, Generator
 from pathlib import Path
-from typing import Callable, Generator, List, Optional
 
 import torch
 import torch.nn.functional as F
@@ -41,8 +41,8 @@ class Llama3:
         ckpt_dir: str,
         max_seq_len: int,
         max_batch_size: int,
-        world_size: Optional[int] = None,
-        quantization_mode: Optional[QuantizationMode] = None,
+        world_size: int | None = None,
+        quantization_mode: QuantizationMode | None = None,
         seed: int = 1,
         device: str = "cuda",
     ):
@@ -82,7 +82,7 @@ class Llama3:
         ckpt_paths = sorted(Path(ckpt_dir).glob("*.pth"))
         assert len(ckpt_paths) > 0, f"no checkpoint files found in {ckpt_dir}"
         print(f"Loading a checkpoint (shards={len(ckpt_paths)}, current-mp-size={world_size})")
-        with open(Path(ckpt_dir) / "params.json", "r") as f:
+        with open(Path(ckpt_dir) / "params.json") as f:
             params = json.loads(f.read())
 
         model_args: ModelArgs = ModelArgs(
@@ -154,15 +154,15 @@ class Llama3:
     @torch.inference_mode()
     def generate(
         self,
-        llm_inputs: List[LLMInput],
+        llm_inputs: list[LLMInput],
         temperature: float = 0.6,
         top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
+        max_gen_len: int | None = None,
         logprobs: bool = False,
         echo: bool = False,
         print_model_input: bool = False,
-        logits_processor: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-    ) -> Generator[List[GenerationResult], None, None]:
+        logits_processor: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+    ) -> Generator[list[GenerationResult], None, None]:
         if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
             max_gen_len = self.args.max_seq_len - 1
         params = self.model.params
@@ -174,6 +174,7 @@ class Llama3:
                 cprint(
                     "Input to model:\n" + self.tokenizer.decode(tokens_to_print) + "\n",
                     "red",
+                    file=sys.stderr,
                 )
         prompt_tokens = [inp.tokens for inp in llm_inputs]
 
@@ -184,7 +185,11 @@ class Llama3:
         max_prompt_len = max(len(t) for t in prompt_tokens)
 
         if max_prompt_len >= params.max_seq_len:
-            cprint(f"Out of token budget {max_prompt_len} vs {params.max_seq_len}", "red")
+            cprint(
+                f"Out of token budget {max_prompt_len} vs {params.max_seq_len}",
+                color="red",
+                file=sys.stderr,
+            )
             return
 
         total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
@@ -302,13 +307,13 @@ class Llama3:
 
     def completion(
         self,
-        contents: List[RawContent],
+        contents: list[RawContent],
         temperature: float = 0.6,
         top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
+        max_gen_len: int | None = None,
         logprobs: bool = False,
         echo: bool = False,
-    ) -> Generator[List[GenerationResult], None, None]:
+    ) -> Generator[list[GenerationResult], None, None]:
         model_inputs = [self.formatter.encode_content(c) for c in contents]
         for result in self.generate(
             model_inputs=model_inputs,
@@ -324,14 +329,14 @@ class Llama3:
 
     def chat_completion(
         self,
-        messages_batch: List[List[RawMessage]],
+        messages_batch: list[list[RawMessage]],
         temperature: float = 0.6,
         top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
+        max_gen_len: int | None = None,
         logprobs: bool = False,
         tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
         echo: bool = False,
-    ) -> Generator[List[GenerationResult], None, None]:
+    ) -> Generator[list[GenerationResult], None, None]:
         model_inputs = [self.formatter.encode_dialog_prompt(messages) for messages in messages_batch]
         for result in self.generate(
             model_inputs=model_inputs,
diff --git a/llama_stack/models/llama/llama3/interface.py b/llama_stack/models/llama/llama3/interface.py
index 8684237df..b63ba4847 100644
--- a/llama_stack/models/llama/llama3/interface.py
+++ b/llama_stack/models/llama/llama3/interface.py
@@ -12,7 +12,6 @@
 # the top-level of this source tree.
 
 from pathlib import Path
-from typing import List, Optional
 
 from termcolor import colored
 
@@ -131,7 +130,7 @@ class LLama31Interface:
         self.formatter = ChatFormat(self.tokenizer)
         self.tool_prompt_format = tool_prompt_format
 
-    def get_tokens(self, messages: List[RawMessage]) -> List[int]:
+    def get_tokens(self, messages: list[RawMessage]) -> list[int]:
         model_input = self.formatter.encode_dialog_prompt(
             messages,
             self.tool_prompt_format,
@@ -149,10 +148,10 @@ class LLama31Interface:
 
     def system_messages(
         self,
-        builtin_tools: List[BuiltinTool],
-        custom_tools: List[ToolDefinition],
-        instruction: Optional[str] = None,
-    ) -> List[RawMessage]:
+        builtin_tools: list[BuiltinTool],
+        custom_tools: list[ToolDefinition],
+        instruction: str | None = None,
+    ) -> list[RawMessage]:
         messages = []
 
         default_gen = SystemDefaultGenerator()
@@ -194,8 +193,8 @@ class LLama31Interface:
         self,
         content: str,
         stop_reason: StopReason,
-        tool_call: Optional[ToolCall] = None,
-    ) -> List[RawMessage]:
+        tool_call: ToolCall | None = None,
+    ) -> list[RawMessage]:
         tool_calls = []
         if tool_call:
             tool_calls.append(tool_call)
@@ -208,7 +207,7 @@ class LLama31Interface:
             )
         ]
 
-    def user_message(self, content: str) -> List[RawMessage]:
+    def user_message(self, content: str) -> list[RawMessage]:
         return [RawMessage(role="user", content=content)]
 
     def display_message_as_tokens(self, message: RawMessage) -> None:
@@ -228,7 +227,7 @@ class LLama31Interface:
         print("\n", end="")
 
 
-def list_jinja_templates() -> List[Template]:
+def list_jinja_templates() -> list[Template]:
     return TEMPLATES
 
 
diff --git a/llama_stack/models/llama/llama3/model.py b/llama_stack/models/llama/llama3/model.py
index 2562673e2..88f748c1d 100644
--- a/llama_stack/models/llama/llama3/model.py
+++ b/llama_stack/models/llama/llama3/model.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 
 import math
-from typing import Optional, Tuple
 
 import fairscale.nn.model_parallel.initialize as fs_init
 import torch
@@ -80,7 +79,7 @@ def apply_rotary_emb(
     xq: torch.Tensor,
     xk: torch.Tensor,
     freqs_cis: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
     freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
@@ -162,7 +161,7 @@ class Attention(nn.Module):
         x: torch.Tensor,
         start_pos: int,
         freqs_cis: torch.Tensor,
-        mask: Optional[torch.Tensor],
+        mask: torch.Tensor | None,
     ):
         bsz, seqlen, _ = x.shape
         xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
@@ -204,7 +203,7 @@ class FeedForward(nn.Module):
         dim: int,
         hidden_dim: int,
         multiple_of: int,
-        ffn_dim_multiplier: Optional[float],
+        ffn_dim_multiplier: float | None,
     ):
         super().__init__()
         hidden_dim = int(2 * hidden_dim / 3)
@@ -243,7 +242,7 @@ class TransformerBlock(nn.Module):
         x: torch.Tensor,
         start_pos: int,
         freqs_cis: torch.Tensor,
-        mask: Optional[torch.Tensor],
+        mask: torch.Tensor | None,
     ):
         h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
         out = h + self.feed_forward(self.ffn_norm(h))
diff --git a/llama_stack/models/llama/llama3/multimodal/image_transform.py b/llama_stack/models/llama/llama3/multimodal/image_transform.py
index c156d6d2e..f2761ee47 100644
--- a/llama_stack/models/llama/llama3/multimodal/image_transform.py
+++ b/llama_stack/models/llama/llama3/multimodal/image_transform.py
@@ -14,7 +14,7 @@
 import math
 from collections import defaultdict
 from logging import getLogger
-from typing import Any, Optional, Set, Tuple
+from typing import Any
 
 import torch
 import torchvision.transforms as tv
@@ -26,7 +26,7 @@ IMAGE_RES = 224
 logger = getLogger()
 
 
-class VariableSizeImageTransform(object):
+class VariableSizeImageTransform:
     """
     This class accepts images of any size and dynamically resize, pads and chunks it
     based on the image aspect ratio and the number of image chunks we allow.
@@ -75,7 +75,7 @@ class VariableSizeImageTransform(object):
         self.resample = tv.InterpolationMode.BILINEAR
 
     @staticmethod
-    def get_factors(n: int) -> Set[int]:
+    def get_factors(n: int) -> set[int]:
         """
         Calculate all factors of a given number, i.e. a dividor that leaves
         no remainder. For example, if n=12, it will return {1, 2, 3, 4, 6, 12}.
@@ -145,9 +145,9 @@ class VariableSizeImageTransform(object):
 
     @staticmethod
     def get_max_res_without_distortion(
-        image_size: Tuple[int, int],
-        target_size: Tuple[int, int],
-    ) -> Tuple[int, int]:
+        image_size: tuple[int, int],
+        target_size: tuple[int, int],
+    ) -> tuple[int, int]:
         """
         Determines the maximum resolution to which an image can be resized to without distorting its
         aspect ratio, based on the target resolution.
@@ -198,8 +198,8 @@ class VariableSizeImageTransform(object):
     def resize_without_distortion(
         self,
         image: torch.Tensor,
-        target_size: Tuple[int, int],
-        max_upscaling_size: Optional[int],
+        target_size: tuple[int, int],
+        max_upscaling_size: int | None,
     ) -> torch.Tensor:
         """
         Used to resize an image to target_resolution, without distortion.
@@ -261,10 +261,10 @@ class VariableSizeImageTransform(object):
 
     def get_best_fit(
         self,
-        image_size: Tuple[int, int],
+        image_size: tuple[int, int],
         possible_resolutions: torch.Tensor,
         resize_to_max_canvas: bool = False,
-    ) -> Tuple[int, int]:
+    ) -> tuple[int, int]:
         """
         Determines the best canvas possible from a list of possible resolutions to, without distortion,
         resize an image to.
@@ -364,7 +364,7 @@ class VariableSizeImageTransform(object):
         max_num_chunks: int,
         normalize_img: bool = True,
         resize_to_max_canvas: bool = False,
-    ) -> Tuple[Any, Any]:
+    ) -> tuple[Any, Any]:
         """
         Args:
             image (PIL.Image): Image to be resized.
diff --git a/llama_stack/models/llama/llama3/multimodal/model.py b/llama_stack/models/llama/llama3/multimodal/model.py
index 0cb18b948..5f1c3605c 100644
--- a/llama_stack/models/llama/llama3/multimodal/model.py
+++ b/llama_stack/models/llama/llama3/multimodal/model.py
@@ -6,8 +6,9 @@
 
 import logging
 import math
+from collections.abc import Callable
 from functools import partial
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any
 
 import fairscale.nn.model_parallel.initialize as fs_init
 import torch
@@ -104,9 +105,9 @@ class ColumnParallelConv2dPatch(torch.nn.Module):
         self,
         in_channels: int,
         out_channels: int,
-        kernel_size: Union[int, Tuple[int, int]],
-        stride: Union[int, Tuple[int, int]],
-        bias: Optional[bool] = False,
+        kernel_size: int | tuple[int, int],
+        stride: int | tuple[int, int],
+        bias: bool | None = False,
     ) -> None:
         super().__init__()
         if isinstance(kernel_size, int):
@@ -390,13 +391,13 @@ class VisionEncoder(nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool = True,
-        missing_keys: List[str] = None,
-        unexpected_keys: List[str] = None,
-        error_msgs: List[str] = None,
+        missing_keys: list[str] = None,
+        unexpected_keys: list[str] = None,
+        error_msgs: list[str] = None,
         return_state_dict: bool = False,
     ) -> None:
         orig_pos_embed = state_dict.get(prefix + "positional_embedding")
@@ -641,7 +642,7 @@ class FeedForward(nn.Module):
         dim: int,
         hidden_dim: int,
         multiple_of: int,
-        ffn_dim_multiplier: Optional[float],
+        ffn_dim_multiplier: float | None,
     ):
         """
         Initialize the FeedForward module.
@@ -983,7 +984,7 @@ class CrossAttentionTransformerBlock(torch.nn.Module):
         self,
         x: torch.Tensor,
         xattn_mask: torch.Tensor,
-        full_text_row_masked_out_mask: Tuple[torch.Tensor, torch.Tensor],
+        full_text_row_masked_out_mask: tuple[torch.Tensor, torch.Tensor],
         xattn_cache: torch.Tensor,
     ) -> torch.Tensor:
         _attn_out = self.attention(
@@ -1144,7 +1145,7 @@ class CrossAttentionTransformerText(torch.nn.Module):
     def _init_fusion_schedule(
         self,
         num_layers: int,
-    ) -> List[int]:
+    ) -> list[int]:
         llama_layers = list(range(self.n_llama_layers))
 
         # uniformly spread the layers
@@ -1231,7 +1232,7 @@ class CrossAttentionTransformerText(torch.nn.Module):
         text_dtype,
         vision_tokens,
         cross_attention_masks,
-    ) -> Tuple[Tensor, Tensor]:
+    ) -> tuple[Tensor, Tensor]:
         assert vision_tokens is not None, "Vision tokens must be provided"
         vision_seqlen = vision_tokens.shape[3]
         assert vision_tokens.shape[1] == cross_attention_masks.shape[2], (
@@ -1280,11 +1281,11 @@ class CrossAttentionTransformer(torch.nn.Module):
 
     def compute_vision_tokens_masks(
         self,
-        batch_images: List[List[PIL_Image.Image]],
-        batch_masks: List[List[List[int]]],
+        batch_images: list[list[PIL_Image.Image]],
+        batch_masks: list[list[list[int]]],
         total_len: int,
         device: torch.device,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         skip_vision_encoder = False
 
         assert len(batch_images) == len(batch_masks), "Images and masks must have the same length"
@@ -1371,11 +1372,11 @@ class CrossAttentionTransformer(torch.nn.Module):
 
 
 def _stack_images(
-    images: List[List[PIL_Image.Image]],
+    images: list[list[PIL_Image.Image]],
     max_num_chunks: int,
     image_res: int,
     max_num_images: int,
-) -> Tuple[torch.Tensor, List[int]]:
+) -> tuple[torch.Tensor, list[int]]:
     """
     Takes a list of list of images and stacks them into a tensor.
     This function is needed since images can be of completely
@@ -1400,8 +1401,8 @@ def _stack_images(
 
 
 def _pad_masks(
-    all_masks: List[List[List[int]]],
-    all_num_chunks: List[List[int]],
+    all_masks: list[list[list[int]]],
+    all_num_chunks: list[list[int]],
     total_len: int,
     max_num_chunks: int,
 ) -> torch.Tensor:
diff --git a/llama_stack/models/llama/llama3/prompt_templates/base.py b/llama_stack/models/llama/llama3/prompt_templates/base.py
index bff2a21e1..0081443be 100644
--- a/llama_stack/models/llama/llama3/prompt_templates/base.py
+++ b/llama_stack/models/llama/llama3/prompt_templates/base.py
@@ -12,7 +12,7 @@
 # the top-level of this source tree.
 
 from dataclasses import dataclass
-from typing import Any, Dict, List
+from typing import Any
 
 from jinja2 import Template
 
@@ -20,7 +20,7 @@ from jinja2 import Template
 @dataclass
 class PromptTemplate:
     template: str
-    data: Dict[str, Any]
+    data: dict[str, Any]
 
     def render(self):
         template = Template(self.template)
@@ -35,5 +35,5 @@ class PromptTemplateGeneratorBase:
     def gen(self, *args, **kwargs) -> PromptTemplate:
         raise NotImplementedError()
 
-    def data_examples(self) -> List[Any]:
+    def data_examples(self) -> list[Any]:
         raise NotImplementedError()
diff --git a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py
index fbc0127fd..ab626e5af 100644
--- a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py
+++ b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py
@@ -13,7 +13,7 @@
 
 import textwrap
 from datetime import datetime
-from typing import Any, List, Optional
+from typing import Any
 
 from llama_stack.apis.inference import (
     BuiltinTool,
@@ -39,12 +39,12 @@ class SystemDefaultGenerator(PromptTemplateGeneratorBase):
             },
         )
 
-    def data_examples(self) -> List[Any]:
+    def data_examples(self) -> list[Any]:
         return [None]
 
 
 class BuiltinToolGenerator(PromptTemplateGeneratorBase):
-    def _tool_breakdown(self, tools: List[ToolDefinition]):
+    def _tool_breakdown(self, tools: list[ToolDefinition]):
         builtin_tools, custom_tools = [], []
         for dfn in tools:
             if isinstance(dfn.tool_name, BuiltinTool):
@@ -54,7 +54,7 @@ class BuiltinToolGenerator(PromptTemplateGeneratorBase):
 
         return builtin_tools, custom_tools
 
-    def gen(self, tools: List[ToolDefinition]) -> PromptTemplate:
+    def gen(self, tools: list[ToolDefinition]) -> PromptTemplate:
         builtin_tools, custom_tools = self._tool_breakdown(tools)
         template_str = textwrap.dedent(
             """
@@ -75,7 +75,7 @@ class BuiltinToolGenerator(PromptTemplateGeneratorBase):
             },
         )
 
-    def data_examples(self) -> List[List[ToolDefinition]]:
+    def data_examples(self) -> list[list[ToolDefinition]]:
         return [
             # builtin tools
             [
@@ -91,7 +91,7 @@ class BuiltinToolGenerator(PromptTemplateGeneratorBase):
 
 
 class JsonCustomToolGenerator(PromptTemplateGeneratorBase):
-    def gen(self, custom_tools: List[ToolDefinition]) -> PromptTemplate:
+    def gen(self, custom_tools: list[ToolDefinition]) -> PromptTemplate:
         template_str = textwrap.dedent(
             """
             Answer the user's question by making use of the following functions if needed.
@@ -137,7 +137,7 @@ class JsonCustomToolGenerator(PromptTemplateGeneratorBase):
             {"custom_tools": [t.model_dump() for t in custom_tools]},
         )
 
-    def data_examples(self) -> List[List[ToolDefinition]]:
+    def data_examples(self) -> list[list[ToolDefinition]]:
         return [
             [
                 ToolDefinition(
@@ -161,7 +161,7 @@ class JsonCustomToolGenerator(PromptTemplateGeneratorBase):
 
 
 class FunctionTagCustomToolGenerator(PromptTemplateGeneratorBase):
-    def gen(self, custom_tools: List[ToolDefinition]) -> PromptTemplate:
+    def gen(self, custom_tools: list[ToolDefinition]) -> PromptTemplate:
         template_str = textwrap.dedent(
             """
             You have access to the following functions:
@@ -199,7 +199,7 @@ class FunctionTagCustomToolGenerator(PromptTemplateGeneratorBase):
             {"custom_tools": [t.model_dump() for t in custom_tools]},
         )
 
-    def data_examples(self) -> List[List[ToolDefinition]]:
+    def data_examples(self) -> list[list[ToolDefinition]]:
         return [
             [
                 ToolDefinition(
@@ -238,14 +238,14 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
         """.strip("\n")
     )
 
-    def gen(self, custom_tools: List[ToolDefinition], system_prompt: Optional[str] = None) -> PromptTemplate:
+    def gen(self, custom_tools: list[ToolDefinition], system_prompt: str | None = None) -> PromptTemplate:
         system_prompt = system_prompt or self.DEFAULT_PROMPT
         return PromptTemplate(
             system_prompt,
             {"function_description": self._gen_function_description(custom_tools)},
         )
 
-    def _gen_function_description(self, custom_tools: List[ToolDefinition]) -> PromptTemplate:
+    def _gen_function_description(self, custom_tools: list[ToolDefinition]) -> str:
         template_str = textwrap.dedent(
             """
             Here is a list of functions in JSON format that you can invoke.
@@ -286,12 +286,14 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
 
             """
         )
-        return PromptTemplate(
+        template = PromptTemplate(
             template_str.strip("\n"),
             {"tools": [t.model_dump() for t in custom_tools]},
-        ).render()
+        )
+        rendered: str = template.render()
+        return rendered
 
-    def data_examples(self) -> List[List[ToolDefinition]]:
+    def data_examples(self) -> list[list[ToolDefinition]]:
         return [
             [
                 ToolDefinition(
diff --git a/llama_stack/models/llama/llama3/prompt_templates/tool_response.py b/llama_stack/models/llama/llama3/prompt_templates/tool_response.py
index 3df4dac14..4da171279 100644
--- a/llama_stack/models/llama/llama3/prompt_templates/tool_response.py
+++ b/llama_stack/models/llama/llama3/prompt_templates/tool_response.py
@@ -12,7 +12,6 @@
 # the top-level of this source tree.
 
 import textwrap
-from typing import Optional
 
 from .base import PromptTemplate, PromptTemplateGeneratorBase
 
@@ -21,8 +20,8 @@ class ToolResponseGenerator(PromptTemplateGeneratorBase):
     def gen(
         self,
         status: str,
-        stdout: Optional[str] = None,
-        stderr: Optional[str] = None,
+        stdout: str | None = None,
+        stderr: str | None = None,
     ):
         assert status in [
             "success",
diff --git a/llama_stack/models/llama/llama3/quantization/loader.py b/llama_stack/models/llama/llama3/quantization/loader.py
index 771fd02be..436cfa6fa 100644
--- a/llama_stack/models/llama/llama3/quantization/loader.py
+++ b/llama_stack/models/llama/llama3/quantization/loader.py
@@ -6,7 +6,7 @@
 
 # type: ignore
 import os
-from typing import Any, Dict, List, Optional, cast
+from typing import Any, cast
 
 import torch
 from fairscale.nn.model_parallel.initialize import get_model_parallel_rank
@@ -37,9 +37,9 @@ def swiglu_wrapper(
 def convert_to_quantized_model(
     model: Transformer | CrossAttentionTransformer,
     checkpoint_dir: str,
-    quantization_mode: Optional[str] = None,
-    fp8_activation_scale_ub: Optional[float] = 1200.0,
-    device: Optional[torch.device] = None,
+    quantization_mode: str | None = None,
+    fp8_activation_scale_ub: float | None = 1200.0,
+    device: torch.device | None = None,
 ) -> Transformer | CrossAttentionTransformer:
     if quantization_mode == QuantizationMode.fp8_mixed:
         return convert_to_fp8_quantized_model(model, checkpoint_dir, fp8_activation_scale_ub, device)
@@ -52,8 +52,8 @@ def convert_to_quantized_model(
 def convert_to_fp8_quantized_model(
     model: Transformer,
     checkpoint_dir: str,
-    fp8_activation_scale_ub: Optional[float] = 1200.0,
-    device: Optional[torch.device] = None,
+    fp8_activation_scale_ub: float | None = 1200.0,
+    device: torch.device | None = None,
 ) -> Transformer:
     # Move weights to GPU with quantization
     fp8_scales_path = os.path.join(checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt")
@@ -122,8 +122,8 @@ class Int8DynActInt4WeightLinearLoRA(Int8DynActInt4WeightLinear):
         precision: torch.dtype = torch.float32,
         scales_precision: torch.dtype = torch.float32,
         # LoRA parameters
-        lora_rank: Optional[int] = None,
-        lora_scale: Optional[float] = None,
+        lora_rank: int | None = None,
+        lora_scale: float | None = None,
     ) -> None:
         super().__init__(
             in_features,
@@ -134,8 +134,8 @@ class Int8DynActInt4WeightLinearLoRA(Int8DynActInt4WeightLinear):
             precision=precision,
             scales_precision=scales_precision,
         )
-        self.lora_scale: Optional[float] = None
-        self.adaptor: Optional[nn.Sequential] = None
+        self.lora_scale: float | None = None
+        self.adaptor: nn.Sequential | None = None
         if lora_rank is not None:
             assert lora_scale is not None, "Please specify lora scale for LoRA."
             # Low-rank adaptation. See paper for more details: https://arxiv.org/abs/2106.09685
@@ -147,13 +147,13 @@ class Int8DynActInt4WeightLinearLoRA(Int8DynActInt4WeightLinear):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         """A hook to load the quantized weights from the state dict."""
         if prefix + "zeros" not in state_dict:
@@ -191,13 +191,13 @@ class Int8WeightEmbedding(torch.nn.Embedding):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         """A hook to load the quantized embedding weight and scales from the state dict."""
         weights = state_dict.pop(prefix + "weight")
@@ -221,13 +221,13 @@ class Int8WeightLinear(torch.nn.Linear):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         """A hook to load the quantized linear weight and scales from the state dict."""
         weights = state_dict.pop(prefix + "weight")
@@ -238,8 +238,8 @@ class Int8WeightLinear(torch.nn.Linear):
 def _prepare_model_int4_weight_int8_dynamic_activation(
     model: torch.nn.Module,
     group_size: int,
-    lora_rank: Optional[int],
-    lora_scale: Optional[float],
+    lora_rank: int | None,
+    lora_scale: float | None,
 ):
     """Prepare the model for int4 weight and int8 dynamic activation quantization.
 
@@ -265,7 +265,7 @@ def _prepare_model_int4_weight_int8_dynamic_activation(
             )
             del module
             setattr(model, module_name, quantized_module)
-        elif isinstance(module, (ColumnParallelLinear, RowParallelLinear, nn.Linear)):
+        elif isinstance(module, ColumnParallelLinear | RowParallelLinear | nn.Linear):
             quantized_module = Int8DynActInt4WeightLinearLoRA(
                 in_features=module.in_features,
                 out_features=module.out_features,
@@ -286,7 +286,7 @@ def _prepare_model_int4_weight_int8_dynamic_activation(
 def convert_to_int4_quantized_model(
     model: Transformer | CrossAttentionTransformer,
     checkpoint_dir: str,
-    device: Optional[torch.device] = None,
+    device: torch.device | None = None,
 ) -> Transformer | CrossAttentionTransformer:
     """Convert the model to int4 quantized model."""
     model_args = model.params
diff --git a/llama_stack/models/llama/llama3/tokenizer.py b/llama_stack/models/llama/llama3/tokenizer.py
index d3cc4fc07..e5ada3599 100644
--- a/llama_stack/models/llama/llama3/tokenizer.py
+++ b/llama_stack/models/llama/llama3/tokenizer.py
@@ -5,18 +5,11 @@
 # the root directory of this source tree.
 
 import os
+from collections.abc import Collection, Iterator, Sequence, Set
 from logging import getLogger
 from pathlib import Path
 from typing import (
-    AbstractSet,
-    Collection,
-    Dict,
-    Iterator,
-    List,
     Literal,
-    Optional,
-    Sequence,
-    Union,
     cast,
 )
 
@@ -44,7 +37,7 @@ class Tokenizer:
     Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
     """
 
-    special_tokens: Dict[str, int]
+    special_tokens: dict[str, int]
 
     num_reserved_special_tokens = 256
 
@@ -116,9 +109,9 @@ class Tokenizer:
         *,
         bos: bool,
         eos: bool,
-        allowed_special: Optional[Union[Literal["all"], AbstractSet[str]]] = None,
-        disallowed_special: Union[Literal["all"], Collection[str]] = (),
-    ) -> List[int]:
+        allowed_special: Literal["all"] | Set[str] | None = None,
+        disallowed_special: Literal["all"] | Collection[str] = (),
+    ) -> list[int]:
         """
         Encodes a string into a list of token IDs.
 
@@ -151,7 +144,7 @@ class Tokenizer:
                 s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
             )
         )
-        t: List[int] = []
+        t: list[int] = []
         for substr in substrs:
             t.extend(
                 self.model.encode(
@@ -177,7 +170,7 @@ class Tokenizer:
             str: The decoded string.
         """
         # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
-        return self.model.decode(cast(List[int], t))
+        return self.model.decode(cast(list[int], t))
 
     @staticmethod
     def _split_whitespaces_or_nonwhitespaces(s: str, max_consecutive_slice_len: int) -> Iterator[str]:
diff --git a/llama_stack/models/llama/llama3/tool_utils.py b/llama_stack/models/llama/llama3/tool_utils.py
index 91b46ec98..574080184 100644
--- a/llama_stack/models/llama/llama3/tool_utils.py
+++ b/llama_stack/models/llama/llama3/tool_utils.py
@@ -6,7 +6,6 @@
 
 import json
 import re
-from typing import Optional, Tuple
 
 from llama_stack.log import get_logger
 
@@ -172,7 +171,7 @@ class ToolUtils:
         return match is not None
 
     @staticmethod
-    def maybe_extract_builtin_tool_call(message_body: str) -> Optional[Tuple[str, str]]:
+    def maybe_extract_builtin_tool_call(message_body: str) -> tuple[str, str] | None:
         # Find the first match in the text
         match = re.search(BUILTIN_TOOL_PATTERN, message_body)
 
@@ -185,7 +184,7 @@ class ToolUtils:
             return None
 
     @staticmethod
-    def maybe_extract_custom_tool_call(message_body: str) -> Optional[Tuple[str, str]]:
+    def maybe_extract_custom_tool_call(message_body: str) -> tuple[str, str] | None:
         # NOTE: Custom function too calls are still experimental
         # Sometimes, response is of the form
         # {"type": "function", "name": "function_name", "parameters": {...}
@@ -252,7 +251,7 @@ class ToolUtils:
                 def format_value(value: RecursiveType) -> str:
                     if isinstance(value, str):
                         return f'"{value}"'
-                    elif isinstance(value, (int, float, bool)) or value is None:
+                    elif isinstance(value, int | float | bool) or value is None:
                         return str(value)
                     elif isinstance(value, list):
                         return f"[{', '.join(format_value(v) for v in value)}]"
diff --git a/llama_stack/models/llama/llama3_1/prompts.py b/llama_stack/models/llama/llama3_1/prompts.py
index 9dcc51dc8..579a5ee02 100644
--- a/llama_stack/models/llama/llama3_1/prompts.py
+++ b/llama_stack/models/llama/llama3_1/prompts.py
@@ -12,7 +12,6 @@
 # the top-level of this source tree.
 
 import textwrap
-from typing import List
 
 from llama_stack.models.llama.datatypes import (
     BuiltinTool,
@@ -73,7 +72,7 @@ def wolfram_alpha_response():
     )
 
 
-def usecases() -> List[UseCase | str]:
+def usecases() -> list[UseCase | str]:
     return [
         textwrap.dedent(
             """
diff --git a/llama_stack/models/llama/llama3_3/prompts.py b/llama_stack/models/llama/llama3_3/prompts.py
index 194e4fa26..60349e578 100644
--- a/llama_stack/models/llama/llama3_3/prompts.py
+++ b/llama_stack/models/llama/llama3_3/prompts.py
@@ -12,7 +12,6 @@
 # the top-level of this source tree.
 
 import textwrap
-from typing import List
 
 from llama_stack.models.llama.datatypes import (
     BuiltinTool,
@@ -74,7 +73,7 @@ def wolfram_alpha_response():
     )
 
 
-def usecases() -> List[UseCase | str]:
+def usecases() -> list[UseCase | str]:
     return [
         textwrap.dedent(
             """
diff --git a/llama_stack/models/llama/llama4/args.py b/llama_stack/models/llama/llama4/args.py
index dd5f7cbde..523d6ed10 100644
--- a/llama_stack/models/llama/llama4/args.py
+++ b/llama_stack/models/llama/llama4/args.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Optional
 
 from pydantic import BaseModel, model_validator
 
@@ -15,8 +14,8 @@ class QuantizationScheme(Enum):
 
 
 class QuantizationArgs(BaseModel):
-    scheme: Optional[QuantizationScheme] = None
-    group_size: Optional[int] = None
+    scheme: QuantizationScheme | None = None
+    group_size: int | None = None
     spinquant: bool = False
 
 
@@ -58,32 +57,32 @@ class ModelArgs(BaseModel):
     dim: int = -1
     n_layers: int = -1
     n_heads: int = -1
-    n_kv_heads: Optional[int] = None
-    head_dim: Optional[int] = None
+    n_kv_heads: int | None = None
+    head_dim: int | None = None
 
     vocab_size: int = -1
     multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
-    ffn_dim_multiplier: Optional[float] = None
-    ffn_exp: Optional[float] = None
+    ffn_dim_multiplier: float | None = None
+    ffn_exp: float | None = None
     norm_eps: float = 1e-5
 
-    attention_chunk_size: Optional[int] = None
+    attention_chunk_size: int | None = None
     rope_theta: float = 500000
     use_scaled_rope: bool = False
-    rope_scaling_factor: Optional[float] = None
-    rope_high_freq_factor: Optional[float] = None
+    rope_scaling_factor: float | None = None
+    rope_high_freq_factor: float | None = None
 
-    nope_layer_interval: Optional[int] = None  # No position encoding in every n layers
+    nope_layer_interval: int | None = None  # No position encoding in every n layers
     use_qk_norm: bool = False
     # Set to True to enable inference-time temperature tuning (useful for very long context)
     attn_temperature_tuning: bool = False
     floor_scale: float = 8192.0
     attn_scale: float = 0.1
 
-    vision_args: Optional[VisionArgs] = None
-    moe_args: Optional[MoEArgs] = None
-    quantization_args: Optional[QuantizationArgs] = None
-    lora_args: Optional[LoRAArgs] = None
+    vision_args: VisionArgs | None = None
+    moe_args: MoEArgs | None = None
+    quantization_args: QuantizationArgs | None = None
+    lora_args: LoRAArgs | None = None
 
     max_batch_size: int = 32
     max_seq_len: int = 2048
diff --git a/llama_stack/models/llama/llama4/chat_format.py b/llama_stack/models/llama/llama4/chat_format.py
index 1574eeb5e..96ebd0881 100644
--- a/llama_stack/models/llama/llama4/chat_format.py
+++ b/llama_stack/models/llama/llama4/chat_format.py
@@ -8,7 +8,6 @@ import io
 import json
 import uuid
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
 
 import torch
 from PIL import Image as PIL_Image
@@ -46,10 +45,10 @@ def role_str(role: Role) -> str:
 class TransformedImage:
     image_tiles: torch.Tensor
     # is the aspect ratio needed anywhere?
-    aspect_ratio: Tuple[int, int]
+    aspect_ratio: tuple[int, int]
 
 
-def convert_image_to_rgb(image: PIL_Image.Image, bg: Tuple[int, int, int] = (255, 255, 255)) -> PIL_Image.Image:
+def convert_image_to_rgb(image: PIL_Image.Image, bg: tuple[int, int, int] = (255, 255, 255)) -> PIL_Image.Image:
     if image.mode == "RGBA":
         image.load()  # for png.split()
         new_img = PIL_Image.new("RGB", image.size, bg)
@@ -59,12 +58,12 @@ def convert_image_to_rgb(image: PIL_Image.Image, bg: Tuple[int, int, int] = (255
 
 
 class ChatFormat:
-    possible_headers: Dict[Role, str]
+    possible_headers: dict[Role, str]
 
     def __init__(
         self,
         tokenizer: Tokenizer,
-        vision_args: Optional[VisionArgs] = None,
+        vision_args: VisionArgs | None = None,
         max_num_chunks: int = 16,
     ):
         self.tokenizer = tokenizer
@@ -81,7 +80,7 @@ class ChatFormat:
                 vision_args.image_size.width, vision_args.image_size.height
             )
 
-    def _encode_header(self, role: str) -> List[int]:
+    def _encode_header(self, role: str) -> list[int]:
         tokens = []
         tokens.append(self.tokenizer.special_tokens["<|header_start|>"])
 
@@ -98,7 +97,7 @@ class ChatFormat:
     def _encode_image(
         self,
         transformed_image: TransformedImage,
-    ) -> List[int]:
+    ) -> list[int]:
         assert self.vision_args is not None, "The model is not vision-enabled"
 
         image_tensor = transformed_image.image_tiles
@@ -140,7 +139,7 @@ class ChatFormat:
 
         return tokens
 
-    def _encode_content(self, content: RawContent, bos: bool = False) -> Tuple[List[int], List[TransformedImage]]:
+    def _encode_content(self, content: RawContent, bos: bool = False) -> tuple[list[int], list[TransformedImage]]:
         tokens = []
         tranformed_images = []
 
@@ -189,7 +188,7 @@ class ChatFormat:
 
     def encode_message(
         self, message: RawMessage, tool_prompt_format: ToolPromptFormat
-    ) -> Tuple[List[int], List[TransformedImage]]:
+    ) -> tuple[list[int], list[TransformedImage]]:
         tokens = self._encode_header(message.role)
         images = []
 
@@ -223,7 +222,7 @@ class ChatFormat:
 
     def encode_dialog_prompt(
         self,
-        messages: List[RawMessage],
+        messages: list[RawMessage],
         tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
     ) -> LLMInput:
         tokens = []
@@ -240,7 +239,7 @@ class ChatFormat:
         return self._model_input_from_tokens_images(tokens, images)
 
     # TODO(this should be generic, not only for assistant messages)
-    def decode_assistant_message(self, tokens: List[int], stop_reason: StopReason) -> RawMessage:
+    def decode_assistant_message(self, tokens: list[int], stop_reason: StopReason) -> RawMessage:
         content = self.tokenizer.decode(tokens)
 
         return self.decode_assistant_message_from_content(content, stop_reason)
@@ -312,7 +311,7 @@ class ChatFormat:
             tool_calls=tool_calls,
         )
 
-    def _model_input_from_tokens_images(self, tokens: List[int], images: List[TransformedImage]) -> LLMInput:
+    def _model_input_from_tokens_images(self, tokens: list[int], images: list[TransformedImage]) -> LLMInput:
         return LLMInput(
             tokens=tokens,
             images=[x.image_tiles for x in images] if len(images) > 0 else None,
diff --git a/llama_stack/models/llama/llama4/datatypes.py b/llama_stack/models/llama/llama4/datatypes.py
index 27174db63..24d8ae948 100644
--- a/llama_stack/models/llama/llama4/datatypes.py
+++ b/llama_stack/models/llama/llama4/datatypes.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 
 from dataclasses import dataclass
-from typing import List, Optional, Union
 
 import torch
 
@@ -30,7 +29,7 @@ class LLMInput:
     tokens: torch.Tensor
 
     # images are already pre-processed (resized, tiled, etc.)
-    images: Optional[List[torch.Tensor]] = None
+    images: list[torch.Tensor] | None = None
 
 
 @dataclass
@@ -45,8 +44,8 @@ class TransformerInput:
     # tokens_position defines the position of the tokens in each batch,
     # - when it is a tensor ([batch_size,]), it is the start position of the tokens in each batch
     # - when it is an int, the start position are the same for all batches
-    tokens_position: Union[torch.Tensor, int]
-    image_embedding: Optional[MaskedEmbedding] = None
+    tokens_position: torch.Tensor | int
+    image_embedding: MaskedEmbedding | None = None
 
 
 @dataclass
diff --git a/llama_stack/models/llama/llama4/ffn.py b/llama_stack/models/llama/llama4/ffn.py
index 9c9fca5fc..6584f1a2a 100644
--- a/llama_stack/models/llama/llama4/ffn.py
+++ b/llama_stack/models/llama/llama4/ffn.py
@@ -11,7 +11,7 @@
 # top-level folder for each specific model found within the models/ directory at
 # the top-level of this source tree.
 
-from typing import Any, Dict, List
+from typing import Any
 
 from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
@@ -36,13 +36,13 @@ class FeedForward(nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         if prefix + "mlp.fc1_weight" in state_dict:
             w1, w3 = state_dict.pop(prefix + "mlp.fc1_weight").chunk(2, dim=0)
diff --git a/llama_stack/models/llama/llama4/generation.py b/llama_stack/models/llama/llama4/generation.py
index 8e94bb33a..6132d25d4 100644
--- a/llama_stack/models/llama/llama4/generation.py
+++ b/llama_stack/models/llama/llama4/generation.py
@@ -10,8 +10,8 @@ import json
 import os
 import sys
 import time
+from collections.abc import Callable, Generator
 from pathlib import Path
-from typing import Callable, Generator, List, Optional
 
 import torch
 import torch.nn.functional as F
@@ -38,8 +38,8 @@ class Llama4:
         ckpt_dir: str,
         max_seq_len: int,
         max_batch_size: int,
-        world_size: Optional[int] = None,
-        quantization_mode: Optional[QuantizationMode] = None,
+        world_size: int | None = None,
+        quantization_mode: QuantizationMode | None = None,
         seed: int = 1,
     ):
         if not torch.distributed.is_initialized():
@@ -63,7 +63,7 @@ class Llama4:
         ckpt_paths = sorted(Path(ckpt_dir).glob("*.pth"))
         assert len(ckpt_paths) > 0, f"no checkpoint files found in {ckpt_dir}"
         print(f"Loading a checkpoint (shards={len(ckpt_paths)}, current-mp-size={world_size})")
-        with open(Path(ckpt_dir) / "params.json", "r") as f:
+        with open(Path(ckpt_dir) / "params.json") as f:
             params = json.loads(f.read())
 
         model_args: ModelArgs = ModelArgs(
@@ -117,15 +117,15 @@ class Llama4:
     @torch.inference_mode()
     def generate(
         self,
-        llm_inputs: List[LLMInput],
+        llm_inputs: list[LLMInput],
         temperature: float = 0.6,
         top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
+        max_gen_len: int | None = None,
         logprobs: bool = False,
         echo: bool = False,
         print_model_input: bool = False,
-        logits_processor: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
-    ) -> Generator[List[GenerationResult], None, None]:
+        logits_processor: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+    ) -> Generator[list[GenerationResult], None, None]:
         if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.model.args.max_seq_len:
             max_gen_len = self.model.args.max_seq_len - 1
 
@@ -133,9 +133,9 @@ class Llama4:
 
         print_model_input = print_model_input or os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1"
         if print_model_input:
-            cprint("Input to model:\n", "yellow")
+            cprint("Input to model:\n", color="yellow", file=sys.stderr)
             for inp in llm_inputs:
-                cprint(self.tokenizer.decode(inp.tokens), "grey")
+                cprint(self.tokenizer.decode(inp.tokens), color="grey", file=sys.stderr)
         prompt_tokens = [inp.tokens for inp in llm_inputs]
 
         bsz = len(llm_inputs)
@@ -145,7 +145,7 @@ class Llama4:
         max_prompt_len = max(len(t) for t in prompt_tokens)
 
         if max_prompt_len >= params.max_seq_len:
-            cprint(f"Out of token budget {max_prompt_len} vs {params.max_seq_len}", "red")
+            cprint(f"Out of token budget {max_prompt_len} vs {params.max_seq_len}", color="red", file=sys.stderr)
             return
 
         total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
@@ -245,13 +245,13 @@ class Llama4:
 
     def completion(
         self,
-        contents: List[RawContent],
+        contents: list[RawContent],
         temperature: float = 0.6,
         top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
+        max_gen_len: int | None = None,
         logprobs: bool = False,
         echo: bool = False,
-    ) -> Generator[List[GenerationResult], None, None]:
+    ) -> Generator[list[GenerationResult], None, None]:
         llm_inputs = [self.formatter.encode_content(c) for c in contents]
         for result in self.generate(
             llm_inputs=llm_inputs,
@@ -267,13 +267,13 @@ class Llama4:
 
     def chat_completion(
         self,
-        messages_batch: List[List[RawMessage]],
+        messages_batch: list[list[RawMessage]],
         temperature: float = 0.6,
         top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
+        max_gen_len: int | None = None,
         logprobs: bool = False,
         echo: bool = False,
-    ) -> Generator[List[GenerationResult], None, None]:
+    ) -> Generator[list[GenerationResult], None, None]:
         llm_inputs = [self.formatter.encode_dialog_prompt(messages) for messages in messages_batch]
         for result in self.generate(
             llm_inputs=llm_inputs,
diff --git a/llama_stack/models/llama/llama4/model.py b/llama_stack/models/llama/llama4/model.py
index 2272b868d..4fb1181f7 100644
--- a/llama_stack/models/llama/llama4/model.py
+++ b/llama_stack/models/llama/llama4/model.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import math
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import fairscale.nn.model_parallel.initialize as fs_init
 import torch
@@ -89,7 +89,7 @@ def apply_rotary_emb(
     xq: torch.Tensor,
     xk: torch.Tensor,
     freqs_cis: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
     freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
@@ -174,13 +174,13 @@ class Attention(nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         if prefix + "wqkv.weight" in state_dict:
             wqkv = state_dict.pop(prefix + "wqkv.weight")
@@ -200,7 +200,7 @@ class Attention(nn.Module):
         x: torch.Tensor,
         start_pos: int,
         freqs_cis: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
+        mask: torch.Tensor | None = None,
     ):
         bsz, seqlen, _ = x.shape
         xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
@@ -288,13 +288,13 @@ class TransformerBlock(nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         if prefix + "attention.wqkv.layer_norm_weight" in state_dict:
             state_dict[prefix + "attention_norm.weight"] = state_dict.pop(prefix + "attention.wqkv.layer_norm_weight")
@@ -318,8 +318,8 @@ class TransformerBlock(nn.Module):
         x: torch.Tensor,
         start_pos: int,
         freqs_cis: torch.Tensor,
-        global_attn_mask: Optional[torch.Tensor],
-        local_attn_mask: Optional[torch.Tensor],
+        global_attn_mask: torch.Tensor | None,
+        local_attn_mask: torch.Tensor | None,
     ):
         # The iRoPE architecture uses global attention mask for NoPE layers or
         # if chunked local attention is not used
@@ -374,13 +374,13 @@ class Transformer(nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         if prefix + "rope.freqs" in state_dict:
             state_dict.pop(prefix + "rope.freqs")
diff --git a/llama_stack/models/llama/llama4/moe.py b/llama_stack/models/llama/llama4/moe.py
index 2ce49e915..7475963d3 100644
--- a/llama_stack/models/llama/llama4/moe.py
+++ b/llama_stack/models/llama/llama4/moe.py
@@ -6,7 +6,7 @@
 
 # ruff: noqa: N806
 # pyre-strict
-from typing import Any, Dict, List
+from typing import Any
 
 import fairscale.nn.model_parallel.initialize as fs_init
 import torch
@@ -63,13 +63,13 @@ class Experts(nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         self.prefix = prefix
         if prefix + "moe_w_in_eD_F" in state_dict:
@@ -158,13 +158,13 @@ class MoE(torch.nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
+        missing_keys: list[str],
+        unexpected_keys: list[str],
+        error_msgs: list[str],
     ) -> None:
         if prefix + "w_in_shared_FD.weight" in state_dict:
             state_dict[prefix + "shared_expert.w1.weight"] = state_dict.pop(prefix + "w_in_shared_FD.weight")
@@ -210,5 +210,5 @@ class MoE(torch.nn.Module):
 
 
 def divide_exact(numerator: int, denominator: int) -> int:
-    assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)
+    assert numerator % denominator == 0, f"{numerator} is not divisible by {denominator}"
     return numerator // denominator
diff --git a/llama_stack/models/llama/llama4/preprocess.py b/llama_stack/models/llama/llama4/preprocess.py
index 689680779..7527a9987 100644
--- a/llama_stack/models/llama/llama4/preprocess.py
+++ b/llama_stack/models/llama/llama4/preprocess.py
@@ -13,7 +13,6 @@
 
 import math
 from collections import defaultdict
-from typing import Optional, Set, Tuple
 
 import torch
 import torchvision.transforms as tv
@@ -52,7 +51,7 @@ class ResizeNormalizeImageTransform:
         return self.tv_transform(image)
 
 
-class VariableSizeImageTransform(object):
+class VariableSizeImageTransform:
     """
     This class accepts images of any size and dynamically resize, pads and chunks it
     based on the image aspect ratio and the number of image chunks we allow.
@@ -100,7 +99,7 @@ class VariableSizeImageTransform(object):
         self.resample = tv.InterpolationMode.BILINEAR
 
     @staticmethod
-    def get_factors(n: int) -> Set[int]:
+    def get_factors(n: int) -> set[int]:
         """
         Calculate all factors of a given number, i.e. a dividor that leaves
         no remainder. For example, if n=12, it will return {1, 2, 3, 4, 6, 12}.
@@ -170,9 +169,9 @@ class VariableSizeImageTransform(object):
 
     @staticmethod
     def get_max_res_without_distortion(
-        image_size: Tuple[int, int],
-        target_size: Tuple[int, int],
-    ) -> Tuple[int, int]:
+        image_size: tuple[int, int],
+        target_size: tuple[int, int],
+    ) -> tuple[int, int]:
         """
         Determines the maximum resolution to which an image can be resized to without distorting its
         aspect ratio, based on the target resolution.
@@ -223,8 +222,8 @@ class VariableSizeImageTransform(object):
     def resize_without_distortion(
         self,
         image: torch.Tensor,
-        target_size: Tuple[int, int],
-        max_upscaling_size: Optional[int],
+        target_size: tuple[int, int],
+        max_upscaling_size: int | None,
     ) -> torch.Tensor:
         """
         Used to resize an image to target_resolution, without distortion.
@@ -289,10 +288,10 @@ class VariableSizeImageTransform(object):
 
     def get_best_fit(
         self,
-        image_size: Tuple[int, int],
+        image_size: tuple[int, int],
         possible_resolutions: torch.Tensor,
         resize_to_max_canvas: bool = False,
-    ) -> Tuple[int, int]:
+    ) -> tuple[int, int]:
         """
         Determines the best canvas possible from a list of possible resolutions to, without distortion,
         resize an image to.
@@ -392,7 +391,7 @@ class VariableSizeImageTransform(object):
         max_num_chunks: int,
         normalize_img: bool = True,
         resize_to_max_canvas: bool = False,
-    ) -> Tuple[torch.Tensor, Tuple[int, int]]:
+    ) -> tuple[torch.Tensor, tuple[int, int]]:
         """
         Args:
             image (PIL.Image): Image to be resized.
diff --git a/llama_stack/models/llama/llama4/prompt_format.md b/llama_stack/models/llama/llama4/prompt_format.md
index 698571093..7ae998310 100644
--- a/llama_stack/models/llama/llama4/prompt_format.md
+++ b/llama_stack/models/llama/llama4/prompt_format.md
@@ -64,7 +64,7 @@ This example passes an image that is smaller than the tile size, to show the til
 
 ##### Model Response Format
 ```
-The image depicts a dog standing on a skateboard, with its front paws positioned on the board and its back paws hanging off the back. The dog has a distinctive coat pattern, featuring a white face, brown and black fur, and white paws, and is standing on a skateboard with red wheels, set against a blurred background of a street or alleyway with a teal door and beige wall.<|eot|>
+The image depicts a dog standing on a skateboard, positioned centrally and facing the camera directly. The dog has a distinctive coat pattern featuring white, black, and brown fur, with floppy ears and a black nose, and is standing on a skateboard with red wheels.<|eot|>
 ```
 
 
@@ -91,7 +91,7 @@ Here is an example of how to pass an image to the model
 
 ##### Model Response Format
 ```
-This image shows a dog standing on a skateboard, with its front paws positioned near the front of the board and its back paws near the back. The dog has a white, black, and orange coat, and is standing on a gray skateboard with red wheels, in front of a blurred background that appears to be a street or alleyway.<|eot|>
+The image depicts a dog standing on a skateboard, with the dog positioned centrally and facing forward. The dog has a distinctive coat featuring a mix of white, brown, and black fur, and is wearing a collar as it stands on the skateboard, which has red wheels.<|eot|>
 ```
 
 
@@ -117,7 +117,7 @@ Here is an example of how to pass an image to the model
 
 ##### Model Response Format
 ```
-The first image shows a dog standing on a skateboard, while the second image shows a plate of spaghetti with tomato sauce, parmesan cheese, and parsley. The two images are unrelated, with the first image featuring a dog and the second image featuring a food dish, and they do not share any common elements or themes.<|eot|>
+The first image features a dog standing on a skateboard, while the second image showcases a plate of spaghetti with tomato sauce and cheese. The two images appear to be unrelated, with one depicting a playful scene of a dog on a skateboard and the other presenting a classic Italian dish.<|eom|>
 ```
 
 
@@ -135,25 +135,52 @@ We are continuing the format for zero shot function calling used in previous ver
 ```
 <|begin_of_text|><|header_start|>system<|header_end|>
 
-You are an expert in composing functions. You are given a question and a set of possible functions.
-Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
-If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
-also point it out. You should only return the function call in tools call sections.
+You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:
 
-If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
-You SHOULD NOT include any other text in the response.
+1. FUNCTION CALLS:
+- ONLY use functions that are EXPLICITLY listed in the function list below
+- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or "I don't have access to [Unavailable service] information"
+- If a function is not in the list, respond ONLY with internal knowledge or "I don't have access to [Unavailable service] information"
+- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)
+- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]
+Examples:
+CORRECT: [get_weather(location="Vancouver"), calculate_route(start="Boston", end="New York")] <- Only if get_weather and calculate_route are in function list
+INCORRECT: get_weather(location="New York")
+INCORRECT: Let me check the weather: [get_weather(location="New York")]
+INCORRECT: [get_events(location="Singapore")] <- If function not in list
 
-Here is a list of functions in JSON format that you can invoke.
+2. RESPONSE RULES:
+- For pure function requests matching a listed function: ONLY output the function call(s)
+- For knowledge questions: ONLY output text
+- For missing parameters: ONLY request the specific missing parameters
+- For unavailable services (not in function list): output ONLY with internal knowledge or "I don't have access to [Unavailable service] information". Do NOT execute a function call.
+- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations
+- NEVER combine text and function calls in the same response
+- NEVER suggest alternative functions when the requested service is unavailable
+- NEVER create or invent new functions not listed below
 
+3. STRICT BOUNDARIES:
+- ONLY use functions from the list below - no exceptions
+- NEVER use a function as an alternative to unavailable information
+- NEVER call functions not present in the function list
+- NEVER add explanatory text to function calls
+- NEVER respond with empty brackets
+- Use proper Python/JSON syntax for function calls
+- Check the function list carefully before responding
+
+4. TOOL RESPONSE HANDLING:
+- When receiving tool responses: provide concise, natural language responses
+- Don't repeat tool response verbatim
+- Don't add supplementary information
+
+Here is a list of functions in JSON format that you can invoke:
 [
     {
         "name": "get_weather",
         "description": "Get weather info for places",
         "parameters": {
             "type": "dict",
-            "required": [
-                "city"
-            ],
+            "required": ["city"],
             "properties": {
                 "city": {
                     "type": "string",
@@ -167,7 +194,7 @@ Here is a list of functions in JSON format that you can invoke.
             }
         }
     }
-<|eot|><|header_start|>user<|header_end|>
+]<|eot|><|header_start|>user<|header_end|>
 
 What is the weather in SF and Seattle?<|eot|><|header_start|>assistant<|header_end|>
 
@@ -176,7 +203,7 @@ What is the weather in SF and Seattle?<|eot|><|header_start|>assistant<|header_e
 
 ##### Model Response Format
 ```
-[get_weather(city='SF'), get_weather(city='Seattle')]<|eot|>
+[get_weather(city="San Francisco"), get_weather(city="Seattle")]<|eot|>
 ```
 
 
@@ -273,5 +300,5 @@ Use tools to get latest trending songs<|eot|><|header_start|>assistant<|header_e
 
 ##### Model Response Format
 ```
-{"n": "10"}<|eot|>
+{"n": 10}<|eot|>
 ```
diff --git a/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py
index 139e204ad..9c19f89ae 100644
--- a/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py
+++ b/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py
@@ -12,7 +12,6 @@
 # the top-level of this source tree.
 
 import textwrap
-from typing import List, Optional
 
 from llama_stack.apis.inference import ToolDefinition, ToolParamDefinition
 from llama_stack.models.llama.llama3.prompt_templates.base import (
@@ -62,23 +61,21 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
         - Don't repeat tool response verbatim
         - Don't add supplementary information
 
-
         {{ function_description }}
         """.strip("\n")
     )
 
-    def gen(self, custom_tools: List[ToolDefinition], system_prompt: Optional[str] = None) -> PromptTemplate:
+    def gen(self, custom_tools: list[ToolDefinition], system_prompt: str | None = None) -> PromptTemplate:
         system_prompt = system_prompt or self.DEFAULT_PROMPT
         return PromptTemplate(
             system_prompt,
             {"function_description": self._gen_function_description(custom_tools)},
         )
 
-    def _gen_function_description(self, custom_tools: List[ToolDefinition]) -> PromptTemplate:
+    def _gen_function_description(self, custom_tools: list[ToolDefinition]) -> PromptTemplate:
         template_str = textwrap.dedent(
             """
-            Here is a list of functions in JSON format that you can invoke.
-
+            Here is a list of functions in JSON format that you can invoke:
             [
                 {% for t in tools -%}
                 {# manually setting up JSON because jinja sorts keys in unexpected ways -#}
@@ -109,10 +106,6 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
                 {% endif -%}
                 {%- endfor %}
             ]
-
-            You can answer general questions or invoke tools when necessary.
-            In addition to tool calls, you should also augment your responses by using the tool outputs.
-
             """
         )
         return PromptTemplate(
@@ -120,7 +113,7 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
             {"tools": [t.model_dump() for t in custom_tools]},
         ).render()
 
-    def data_examples(self) -> List[List[ToolDefinition]]:
+    def data_examples(self) -> list[list[ToolDefinition]]:
         return [
             [
                 ToolDefinition(
diff --git a/llama_stack/models/llama/llama4/prompts.py b/llama_stack/models/llama/llama4/prompts.py
index 13b96359a..2da94db7b 100644
--- a/llama_stack/models/llama/llama4/prompts.py
+++ b/llama_stack/models/llama/llama4/prompts.py
@@ -7,7 +7,10 @@
 import textwrap
 from io import BytesIO
 from pathlib import Path
-from typing import List
+
+from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
+    PythonListCustomToolGenerator,
+)
 
 from ..datatypes import RawMediaItem, RawMessage, RawTextItem
 from ..prompt_format import (
@@ -19,7 +22,7 @@ from ..prompt_format import (
 THIS_DIR = Path(__file__).parent
 
 
-def usecases(base_model: bool = False) -> List[UseCase | str]:
+def usecases(base_model: bool = False) -> list[UseCase | str]:
     with open(THIS_DIR.parent / "resources/small_dog.jpg", "rb") as f:
         img_small_dog = f.read()
     with open(THIS_DIR.parent / "resources/dog.jpg", "rb") as f:
@@ -177,39 +180,9 @@ def usecases(base_model: bool = False) -> List[UseCase | str]:
                     [
                         RawMessage(
                             role="system",
-                            content="""You are an expert in composing functions. You are given a question and a set of possible functions.
-Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
-If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
-also point it out. You should only return the function call in tools call sections.
-
-If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
-You SHOULD NOT include any other text in the response.
-
-Here is a list of functions in JSON format that you can invoke.
-
-[
-    {
-        "name": "get_weather",
-        "description": "Get weather info for places",
-        "parameters": {
-            "type": "dict",
-            "required": [
-                "city"
-            ],
-            "properties": {
-                "city": {
-                    "type": "string",
-                    "description": "The name of the city to get the weather for"
-                },
-                "metric": {
-                    "type": "string",
-                    "description": "The metric for weather. Options are: celsius, fahrenheit",
-                    "default": "celsius"
-                }
-            }
-        }
-    }
-""",
+                            content=PythonListCustomToolGenerator()
+                            .gen(PythonListCustomToolGenerator().data_examples()[0])
+                            .render(),
                         ),
                         RawMessage(
                             role="user",
diff --git a/llama_stack/models/llama/llama4/quantization/loader.py b/llama_stack/models/llama/llama4/quantization/loader.py
index f11d83c60..223744a5f 100644
--- a/llama_stack/models/llama/llama4/quantization/loader.py
+++ b/llama_stack/models/llama/llama4/quantization/loader.py
@@ -6,7 +6,7 @@
 
 import logging
 import os
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from fairscale.nn.model_parallel.initialize import get_model_parallel_rank
@@ -45,8 +45,8 @@ def experts_batched_swiglu_wrapper(
 def convert_to_quantized_model(
     model: Transformer,
     checkpoint_dir: str,
-    quantization_mode: Optional[str] = None,
-    fp8_activation_scale_ub: Optional[float] = 1200.0,
+    quantization_mode: str | None = None,
+    fp8_activation_scale_ub: float | None = 1200.0,
     use_rich_progress: bool = True,
 ) -> Transformer:
     from ...quantize_impls import (
@@ -213,7 +213,7 @@ def logging_callbacks(
         )
         task_id = progress.add_task("[blue]Converting layers...", total=total_blocks, status="Starting")
 
-    def update_status(message: Optional[str], completed: Optional[int] = None) -> None:
+    def update_status(message: str | None, completed: int | None = None) -> None:
         if use_rich_progress:
             if message is not None:
                 progress.update(task_id, status=message)
diff --git a/llama_stack/models/llama/llama4/tokenizer.model b/llama_stack/models/llama/llama4/tokenizer.model
old mode 100755
new mode 100644
diff --git a/llama_stack/models/llama/llama4/tokenizer.py b/llama_stack/models/llama/llama4/tokenizer.py
index 0d2cc7ce5..74070d43e 100644
--- a/llama_stack/models/llama/llama4/tokenizer.py
+++ b/llama_stack/models/llama/llama4/tokenizer.py
@@ -5,18 +5,11 @@
 # the root directory of this source tree.
 
 import os
+from collections.abc import Collection, Iterator, Sequence, Set
 from logging import getLogger
 from pathlib import Path
 from typing import (
-    AbstractSet,
-    Collection,
-    Dict,
-    Iterator,
-    List,
     Literal,
-    Optional,
-    Sequence,
-    Union,
     cast,
 )
 
@@ -114,7 +107,7 @@ class Tokenizer:
     Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
     """
 
-    special_tokens: Dict[str, int]
+    special_tokens: dict[str, int]
 
     num_reserved_special_tokens = 2048
 
@@ -182,9 +175,9 @@ class Tokenizer:
         *,
         bos: bool,
         eos: bool,
-        allowed_special: Optional[Union[Literal["all"], AbstractSet[str]]] = None,
-        disallowed_special: Union[Literal["all"], Collection[str]] = (),
-    ) -> List[int]:
+        allowed_special: Literal["all"] | Set[str] | None = None,
+        disallowed_special: Literal["all"] | Collection[str] = (),
+    ) -> list[int]:
         """
         Encodes a string into a list of token IDs.
 
@@ -217,7 +210,7 @@ class Tokenizer:
                 s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
             )
         )
-        t: List[int] = []
+        t: list[int] = []
         for substr in substrs:
             t.extend(
                 self.model.encode(
@@ -243,7 +236,7 @@ class Tokenizer:
             str: The decoded string.
         """
         # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
-        return self.model.decode(cast(List[int], t))
+        return self.model.decode(cast(list[int], t))
 
     @staticmethod
     def _split_whitespaces_or_nonwhitespaces(s: str, max_consecutive_slice_len: int) -> Iterator[str]:
diff --git a/llama_stack/models/llama/llama4/vision/embedding.py b/llama_stack/models/llama/llama4/vision/embedding.py
index ed7659a73..c7dd81965 100644
--- a/llama_stack/models/llama/llama4/vision/embedding.py
+++ b/llama_stack/models/llama/llama4/vision/embedding.py
@@ -5,7 +5,8 @@
 # the root directory of this source tree.
 
 import math
-from typing import Any, Callable, Dict, List
+from collections.abc import Callable
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -136,13 +137,13 @@ class VisionEmbeddings(torch.nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool = True,
-        missing_keys: List[str] = None,
-        unexpected_keys: List[str] = None,
-        error_msgs: List[str] = None,
+        missing_keys: list[str] = None,
+        unexpected_keys: list[str] = None,
+        error_msgs: list[str] = None,
         return_state_dict: bool = False,
     ) -> None:
         original_sd = self.state_dict()
@@ -163,7 +164,7 @@ class VisionEmbeddings(torch.nn.Module):
     # each image is a tensor of shape [num_tiles, C, H, W]
     def forward(
         self,
-        image_batch: List[List[torch.Tensor]],
+        image_batch: list[list[torch.Tensor]],
         image_mask: torch.Tensor,
         h_ref: torch.Tensor,
     ) -> torch.Tensor:
diff --git a/llama_stack/models/llama/llama4/vision/encoder.py b/llama_stack/models/llama/llama4/vision/encoder.py
index 4baf03d8d..4b66f1411 100644
--- a/llama_stack/models/llama/llama4/vision/encoder.py
+++ b/llama_stack/models/llama/llama4/vision/encoder.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from collections.abc import Callable
+from typing import Any
 
 import fairscale.nn.model_parallel.initialize as fs_init
 import torch
@@ -42,9 +43,9 @@ class ColumnParallelConv2dPatch(torch.nn.Module):
         self,
         in_channels: int,
         out_channels: int,
-        kernel_size: Union[int, Tuple[int, int]],
-        stride: Union[int, Tuple[int, int]],
-        bias: Optional[bool] = False,
+        kernel_size: int | tuple[int, int],
+        stride: int | tuple[int, int],
+        bias: bool | None = False,
     ) -> None:
         super().__init__()
         if isinstance(kernel_size, int):
@@ -134,15 +135,15 @@ class _TransformerBlock(nn.Module):
     def attention(
         self,
         x: torch.Tensor,
-        freq_cis: Optional[torch.Tensor] = None,
+        freq_cis: torch.Tensor | None = None,
     ):
         return self.attn(x=x, start_pos=0, freqs_cis=freq_cis)
 
     def forward(
         self,
         x: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
-        freq_cis: Optional[torch.Tensor] = None,
+        mask: torch.Tensor | None = None,
+        freq_cis: torch.Tensor | None = None,
     ):
         _gate_attn = 1 if not self.gated else self.gate_attn.tanh()
         _gate_ffn = 1 if not self.gated else self.gate_ffn.tanh()
@@ -210,8 +211,8 @@ class PackingIndex:
 class VisionEncoder(nn.Module):
     def __init__(
         self,
-        image_size: Tuple[int, int],
-        patch_size: Tuple[int, int],
+        image_size: tuple[int, int],
+        patch_size: tuple[int, int],
         dim: int,
         layers: int,
         heads: int,
@@ -299,13 +300,13 @@ class VisionEncoder(nn.Module):
 
     def load_hook(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         prefix: str,
-        local_metadata: Dict[str, Any],
+        local_metadata: dict[str, Any],
         strict: bool = True,
-        missing_keys: List[str] = None,
-        unexpected_keys: List[str] = None,
-        error_msgs: List[str] = None,
+        missing_keys: list[str] = None,
+        unexpected_keys: list[str] = None,
+        error_msgs: list[str] = None,
         return_state_dict: bool = False,
     ) -> None:
         orig_pos_embed = state_dict.get(prefix + "positional_embedding")
diff --git a/llama_stack/models/llama/prompt_format.py b/llama_stack/models/llama/prompt_format.py
index edb34620c..6191df61a 100644
--- a/llama_stack/models/llama/prompt_format.py
+++ b/llama_stack/models/llama/prompt_format.py
@@ -14,7 +14,6 @@
 import json
 import textwrap
 from pathlib import Path
-from typing import List
 
 from pydantic import BaseModel, Field
 
@@ -44,7 +43,7 @@ class TextCompletionContent(BaseModel):
 class UseCase(BaseModel):
     title: str = ""
     description: str = ""
-    dialogs: List[List[RawMessage] | TextCompletionContent | str] = Field(default_factory=list)
+    dialogs: list[list[RawMessage] | TextCompletionContent | str] = Field(default_factory=list)
     notes: str = ""
     tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json
     max_gen_len: int = 512
diff --git a/llama_stack/models/llama/quantize_impls.py b/llama_stack/models/llama/quantize_impls.py
index a5da01588..a6400c5c9 100644
--- a/llama_stack/models/llama/quantize_impls.py
+++ b/llama_stack/models/llama/quantize_impls.py
@@ -7,7 +7,6 @@
 # type: ignore
 import collections
 import logging
-from typing import Optional, Tuple, Type, Union
 
 log = logging.getLogger(__name__)
 
@@ -27,7 +26,7 @@ class Fp8ScaledWeights:
     # TODO: Ugly trick so torch allows us to replace parameters
     # with our custom Fp8Weights instance. Do this properly.
     @property
-    def __class__(self) -> Type[nn.parameter.Parameter]:
+    def __class__(self) -> type[nn.parameter.Parameter]:
         return nn.Parameter
 
     @property
@@ -51,7 +50,7 @@ class Int4ScaledWeights:
     # TODO: Ugly trick so torch allows us to replace parameters
     # with our custom Int4Weights instance. Do this properly.
     @property
-    def __class__(self) -> Type[nn.parameter.Parameter]:
+    def __class__(self) -> type[nn.parameter.Parameter]:
         return nn.Parameter
 
     @property
@@ -74,7 +73,7 @@ class Int4Weights(
 def int4_row_quantize(
     x: torch.Tensor,
     group_size: int = 128,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     n_bit = 4  # Number of target bits.
     to_quant = x.reshape(-1, group_size).to(torch.float)
 
@@ -115,8 +114,8 @@ def pack_int4(x: torch.Tensor) -> torch.Tensor:
 
 def bmm_nt(
     x: Tensor,
-    w: Union[Fp8RowwiseWeights, Int4Weights],
-    num_tokens: Optional[Tensor] = None,
+    w: Fp8RowwiseWeights | Int4Weights,
+    num_tokens: Tensor | None = None,
 ) -> Tensor:
     if isinstance(w, Fp8ScaledWeights):
         xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(x, num_tokens, w.activation_scale_ub)
@@ -129,10 +128,10 @@ def bmm_nt(
 
 def ffn_swiglu(
     x: Tensor,
-    w1: Union[Fp8RowwiseWeights, Int4Weights],
-    w3: Union[Fp8RowwiseWeights, Int4Weights],
-    w2: Union[Fp8RowwiseWeights, Int4Weights],
-    num_tokens: Optional[Tensor] = None,
+    w1: Fp8RowwiseWeights | Int4Weights,
+    w3: Fp8RowwiseWeights | Int4Weights,
+    w2: Fp8RowwiseWeights | Int4Weights,
+    num_tokens: Tensor | None = None,
     is_memory_bounded: bool = False,
 ) -> Tensor:
     if (isinstance(w1, Fp8ScaledWeights) and isinstance(w3, Fp8ScaledWeights) and isinstance(w2, Fp8ScaledWeights)) or (
@@ -158,7 +157,7 @@ def ffn_swiglu(
 def quantize_fp8(
     w: Tensor,
     fp8_activation_scale_ub: float,
-    output_device: Optional[torch.device] = None,
+    output_device: torch.device | None = None,
 ) -> Fp8RowwiseWeights:
     """Quantize [n, k] weight tensor.
 
@@ -184,7 +183,7 @@ def quantize_fp8(
 @torch.inference_mode()
 def quantize_int4(
     w: Tensor,
-    output_device: Optional[torch.device] = None,
+    output_device: torch.device | None = None,
 ) -> Int4Weights:
     """Quantize [n, k/2] weight tensor.
 
@@ -213,7 +212,7 @@ def load_fp8(
     w: Tensor,
     w_scale: Tensor,
     fp8_activation_scale_ub: float,
-    output_device: Optional[torch.device] = None,
+    output_device: torch.device | None = None,
 ) -> Fp8RowwiseWeights:
     """Load FP8 [n, k] weight tensor.
 
@@ -239,7 +238,7 @@ def load_int4(
     w: Tensor,
     scale: Tensor,
     zero_point: Tensor,
-    output_device: Optional[torch.device] = None,
+    output_device: torch.device | None = None,
 ) -> Int4Weights:
     """Load INT4 [n, k/2] weight tensor.
 
@@ -256,9 +255,9 @@ def load_int4(
 
 def fc_dynamic(
     x: Tensor,
-    w: Union[Fp8RowwiseWeights, Int4Weights],
-    activation_scale_ub: Optional[Tensor] = None,
-    num_tokens: Optional[Tensor] = None,
+    w: Fp8RowwiseWeights | Int4Weights,
+    activation_scale_ub: Tensor | None = None,
+    num_tokens: Tensor | None = None,
     is_memory_bounded: bool = False,
 ) -> Tensor:
     """
@@ -275,11 +274,11 @@ def fc_dynamic(
 
 def ffn_swiglu_dynamic(
     x: Tensor,
-    w1: Union[Fp8RowwiseWeights, Int4Weights],
-    w3: Union[Fp8RowwiseWeights, Int4Weights],
-    w2: Union[Fp8RowwiseWeights, Int4Weights],
-    activation_scale_ub: Optional[Tensor] = None,
-    num_tokens: Optional[Tensor] = None,
+    w1: Fp8RowwiseWeights | Int4Weights,
+    w3: Fp8RowwiseWeights | Int4Weights,
+    w2: Fp8RowwiseWeights | Int4Weights,
+    activation_scale_ub: Tensor | None = None,
+    num_tokens: Tensor | None = None,
     is_memory_bounded: bool = False,
 ) -> Tensor:
     assert x.dim() == 3 or x.dim() == 2
diff --git a/llama_stack/models/llama/sku_list.py b/llama_stack/models/llama/sku_list.py
index 513481831..271cec63f 100644
--- a/llama_stack/models/llama/sku_list.py
+++ b/llama_stack/models/llama/sku_list.py
@@ -6,7 +6,6 @@
 
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import List, Optional
 
 from .sku_types import (
     CheckpointQuantizationFormat,
@@ -19,14 +18,14 @@ LLAMA2_VOCAB_SIZE = 32000
 LLAMA3_VOCAB_SIZE = 128256
 
 
-def resolve_model(descriptor: str) -> Optional[Model]:
+def resolve_model(descriptor: str) -> Model | None:
     for m in all_registered_models():
         if descriptor in (m.descriptor(), m.huggingface_repo):
             return m
     return None
 
 
-def all_registered_models() -> List[Model]:
+def all_registered_models() -> list[Model]:
     return (
         llama2_family()
         + llama3_family()
@@ -38,48 +37,48 @@ def all_registered_models() -> List[Model]:
     )
 
 
-def llama2_family() -> List[Model]:
+def llama2_family() -> list[Model]:
     return [
         *llama2_base_models(),
         *llama2_instruct_models(),
     ]
 
 
-def llama3_family() -> List[Model]:
+def llama3_family() -> list[Model]:
     return [
         *llama3_base_models(),
         *llama3_instruct_models(),
     ]
 
 
-def llama3_1_family() -> List[Model]:
+def llama3_1_family() -> list[Model]:
     return [
         *llama3_1_base_models(),
         *llama3_1_instruct_models(),
     ]
 
 
-def llama3_2_family() -> List[Model]:
+def llama3_2_family() -> list[Model]:
     return [
         *llama3_2_base_models(),
         *llama3_2_instruct_models(),
     ]
 
 
-def llama3_3_family() -> List[Model]:
+def llama3_3_family() -> list[Model]:
     return [
         *llama3_3_instruct_models(),
     ]
 
 
-def llama4_family() -> List[Model]:
+def llama4_family() -> list[Model]:
     return [
         *llama4_base_models(),
         *llama4_instruct_models(),
     ]
 
 
-def llama4_base_models() -> List[Model]:
+def llama4_base_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama4_scout_17b_16e,
@@ -98,7 +97,7 @@ def llama4_base_models() -> List[Model]:
     ]
 
 
-def llama4_instruct_models() -> List[Model]:
+def llama4_instruct_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama4_scout_17b_16e_instruct,
@@ -126,7 +125,7 @@ def llama4_instruct_models() -> List[Model]:
     ]
 
 
-def llama2_base_models() -> List[Model]:
+def llama2_base_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama2_7b,
@@ -185,7 +184,7 @@ def llama2_base_models() -> List[Model]:
     ]
 
 
-def llama3_base_models() -> List[Model]:
+def llama3_base_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama3_8b,
@@ -226,7 +225,7 @@ def llama3_base_models() -> List[Model]:
     ]
 
 
-def llama3_1_base_models() -> List[Model]:
+def llama3_1_base_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama3_1_8b,
@@ -324,7 +323,7 @@ def llama3_1_base_models() -> List[Model]:
     ]
 
 
-def llama3_2_base_models() -> List[Model]:
+def llama3_2_base_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama3_2_1b,
@@ -407,7 +406,7 @@ def llama3_2_base_models() -> List[Model]:
     ]
 
 
-def llama2_instruct_models() -> List[Model]:
+def llama2_instruct_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama2_7b_chat,
@@ -466,7 +465,7 @@ def llama2_instruct_models() -> List[Model]:
     ]
 
 
-def llama3_instruct_models() -> List[Model]:
+def llama3_instruct_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama3_8b_instruct,
@@ -507,7 +506,7 @@ def llama3_instruct_models() -> List[Model]:
     ]
 
 
-def llama3_1_instruct_models() -> List[Model]:
+def llama3_1_instruct_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama3_1_8b_instruct,
@@ -635,7 +634,7 @@ def arch_args_3b() -> dict:
     }
 
 
-def llama3_2_quantized_models() -> List[Model]:
+def llama3_2_quantized_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama3_2_1b_instruct,
@@ -704,7 +703,7 @@ def llama3_2_quantized_models() -> List[Model]:
     ]
 
 
-def llama3_2_instruct_models() -> List[Model]:
+def llama3_2_instruct_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama3_2_1b_instruct,
@@ -766,7 +765,7 @@ def llama3_2_instruct_models() -> List[Model]:
     ]
 
 
-def llama3_3_instruct_models() -> List[Model]:
+def llama3_3_instruct_models() -> list[Model]:
     return [
         Model(
             core_model_id=CoreModelId.llama3_3_70b_instruct,
@@ -790,8 +789,15 @@ def llama3_3_instruct_models() -> List[Model]:
 
 
 @lru_cache
-def safety_models() -> List[Model]:
+def safety_models() -> list[Model]:
     return [
+        Model(
+            core_model_id=CoreModelId.llama_guard_4_12b,
+            description="Llama Guard v4 12b system safety model",
+            huggingface_repo="meta-llama/Llama-Guard-4-12B",
+            arch_args={},
+            pth_file_count=1,
+        ),
         Model(
             core_model_id=CoreModelId.llama_guard_3_11b_vision,
             description="Llama Guard v3 11b vision system safety model",
@@ -912,7 +918,7 @@ def safety_models() -> List[Model]:
 @dataclass
 class LlamaDownloadInfo:
     folder: str
-    files: List[str]
+    files: list[str]
     pth_size: int
 
 
@@ -942,6 +948,8 @@ def llama_meta_net_info(model: Model) -> LlamaDownloadInfo:
     elif model.core_model_id == CoreModelId.llama_guard_2_8b:
         folder = "llama-guard-2"
     else:
+        if model.huggingface_repo is None:
+            raise ValueError(f"Model {model.core_model_id} has no huggingface_repo set")
         folder = model.huggingface_repo.split("/")[-1]
         if "Llama-2" in folder:
             folder = folder.lower()
@@ -1018,3 +1026,4 @@ def llama_meta_pth_size(model: Model) -> int:
                 return 54121549657
             else:
                 return 100426653046
+    return 0
diff --git a/llama_stack/models/llama/sku_types.py b/llama_stack/models/llama/sku_types.py
index 88799b66d..4147707d5 100644
--- a/llama_stack/models/llama/sku_types.py
+++ b/llama_stack/models/llama/sku_types.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, ConfigDict, Field
 
@@ -81,6 +81,7 @@ class CoreModelId(Enum):
     llama_guard_2_8b = "Llama-Guard-2-8B"
     llama_guard_3_11b_vision = "Llama-Guard-3-11B-Vision"
     llama_guard_3_1b = "Llama-Guard-3-1B"
+    llama_guard_4_12b = "Llama-Guard-4-12B"
 
 
 def is_multimodal(model_id) -> bool:
@@ -148,6 +149,7 @@ def model_family(model_id) -> ModelFamily:
         CoreModelId.llama_guard_2_8b,
         CoreModelId.llama_guard_3_11b_vision,
         CoreModelId.llama_guard_3_1b,
+        CoreModelId.llama_guard_4_12b,
     ]:
         return ModelFamily.safety
     else:
@@ -157,13 +159,13 @@ def model_family(model_id) -> ModelFamily:
 class Model(BaseModel):
     core_model_id: CoreModelId
     description: str
-    huggingface_repo: Optional[str] = None
-    arch_args: Dict[str, Any]
+    huggingface_repo: str | None = None
+    arch_args: dict[str, Any]
     variant: str = ""
 
     quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
     pth_file_count: int
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: dict[str, Any] = Field(default_factory=dict)
 
     # silence pydantic until we remove the `model_` fields
     model_config = ConfigDict(protected_namespaces=())
@@ -225,5 +227,7 @@ class Model(BaseModel):
             CoreModelId.llama_guard_3_1b,
         ]:
             return 131072
+        elif self.core_model_id == CoreModelId.llama_guard_4_12b:
+            return 8192
         else:
             raise ValueError(f"Unknown max_seq_len for {self.core_model_id}")
diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py
index c3141f807..60b05545b 100644
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, List, Optional, Protocol
+from typing import Any, Protocol
 from urllib.parse import urlparse
 
 from pydantic import BaseModel, Field
@@ -16,12 +16,33 @@ from llama_stack.apis.datatypes import Api
 from llama_stack.apis.models import Model
 from llama_stack.apis.scoring_functions import ScoringFn
 from llama_stack.apis.shields import Shield
-from llama_stack.apis.tools import Tool
+from llama_stack.apis.tools import ToolGroup
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.schema_utils import json_schema_type
 
 
 class ModelsProtocolPrivate(Protocol):
+    """
+    Protocol for model management.
+
+    This allows users to register their preferred model identifiers.
+
+    Model registration requires -
+     - a provider, used to route the registration request
+     - a model identifier, user's intended name for the model during inference
+     - a provider model identifier, a model identifier supported by the provider
+
+    Providers will only accept registration for provider model ids they support.
+
+    Example,
+      register: provider x my-model-id x provider-model-id
+       -> Error if provider does not support provider-model-id
+       -> Error if my-model-id is already registered
+       -> Success if provider supports provider-model-id
+      inference: my-model-id x ...
+       -> Provider uses provider-model-id for inference
+    """
+
     async def register_model(self, model: Model) -> Model: ...
 
     async def unregister_model(self, model_id: str) -> None: ...
@@ -44,7 +65,7 @@ class DatasetsProtocolPrivate(Protocol):
 
 
 class ScoringFunctionsProtocolPrivate(Protocol):
-    async def list_scoring_functions(self) -> List[ScoringFn]: ...
+    async def list_scoring_functions(self) -> list[ScoringFn]: ...
 
     async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
 
@@ -53,10 +74,10 @@ class BenchmarksProtocolPrivate(Protocol):
     async def register_benchmark(self, benchmark: Benchmark) -> None: ...
 
 
-class ToolsProtocolPrivate(Protocol):
-    async def register_tool(self, tool: Tool) -> None: ...
+class ToolGroupsProtocolPrivate(Protocol):
+    async def register_toolgroup(self, toolgroup: ToolGroup) -> None: ...
 
-    async def unregister_tool(self, tool_id: str) -> None: ...
+    async def unregister_toolgroup(self, toolgroup_id: str) -> None: ...
 
 
 @json_schema_type
@@ -67,24 +88,24 @@ class ProviderSpec(BaseModel):
         ...,
         description="Fully-qualified classname of the config for this provider",
     )
-    api_dependencies: List[Api] = Field(
+    api_dependencies: list[Api] = Field(
         default_factory=list,
         description="Higher-level API surfaces may depend on other providers to provide their functionality",
     )
-    optional_api_dependencies: List[Api] = Field(
+    optional_api_dependencies: list[Api] = Field(
         default_factory=list,
     )
-    deprecation_warning: Optional[str] = Field(
+    deprecation_warning: str | None = Field(
         default=None,
         description="If this provider is deprecated, specify the warning message here",
     )
-    deprecation_error: Optional[str] = Field(
+    deprecation_error: str | None = Field(
         default=None,
         description="If this provider is deprecated and does NOT work, specify the error message here",
     )
 
     # used internally by the resolver; this is a hack for now
-    deps__: List[str] = Field(default_factory=list)
+    deps__: list[str] = Field(default_factory=list)
 
     @property
     def is_sample(self) -> bool:
@@ -110,25 +131,25 @@ Fully-qualified name of the module to import. The module is expected to have:
  - `get_adapter_impl(config, deps)`: returns the adapter implementation
 """,
     )
-    pip_packages: List[str] = Field(
+    pip_packages: list[str] = Field(
         default_factory=list,
         description="The pip dependencies needed for this implementation",
     )
     config_class: str = Field(
         description="Fully-qualified classname of the config for this provider",
     )
-    provider_data_validator: Optional[str] = Field(
+    provider_data_validator: str | None = Field(
         default=None,
     )
 
 
 @json_schema_type
 class InlineProviderSpec(ProviderSpec):
-    pip_packages: List[str] = Field(
+    pip_packages: list[str] = Field(
         default_factory=list,
         description="The pip dependencies needed for this implementation",
     )
-    container_image: Optional[str] = Field(
+    container_image: str | None = Field(
         default=None,
         description="""
 The container image to use for this implementation. If one is provided, pip_packages will be ignored.
@@ -143,14 +164,14 @@ Fully-qualified name of the module to import. The module is expected to have:
  - `get_provider_impl(config, deps)`: returns the local implementation
 """,
     )
-    provider_data_validator: Optional[str] = Field(
+    provider_data_validator: str | None = Field(
         default=None,
     )
 
 
 class RemoteProviderConfig(BaseModel):
     host: str = "localhost"
-    port: Optional[int] = None
+    port: int | None = None
     protocol: str = "http"
 
     @property
@@ -176,7 +197,7 @@ API responses, specify the adapter here.
     )
 
     @property
-    def container_image(self) -> Optional[str]:
+    def container_image(self) -> str | None:
         return None
 
     @property
@@ -184,16 +205,16 @@ API responses, specify the adapter here.
         return self.adapter.module
 
     @property
-    def pip_packages(self) -> List[str]:
+    def pip_packages(self) -> list[str]:
         return self.adapter.pip_packages
 
     @property
-    def provider_data_validator(self) -> Optional[str]:
+    def provider_data_validator(self) -> str | None:
         return self.adapter.provider_data_validator
 
 
 def remote_provider_spec(
-    api: Api, adapter: AdapterSpec, api_dependencies: Optional[List[Api]] = None
+    api: Api, adapter: AdapterSpec, api_dependencies: list[Api] | None = None
 ) -> RemoteProviderSpec:
     return RemoteProviderSpec(
         api=api,
diff --git a/llama_stack/providers/inline/agents/meta_reference/__init__.py b/llama_stack/providers/inline/agents/meta_reference/__init__.py
index 4be064f1d..7503b8c90 100644
--- a/llama_stack/providers/inline/agents/meta_reference/__init__.py
+++ b/llama_stack/providers/inline/agents/meta_reference/__init__.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import Api
 
 from .config import MetaReferenceAgentsImplConfig
 
 
-async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, Any]):
+async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: dict[Api, Any]):
     from .agents import MetaReferenceAgentsImpl
 
     impl = MetaReferenceAgentsImpl(
diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
index b5714b438..2e387e7e8 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@@ -10,8 +10,8 @@ import re
 import secrets
 import string
 import uuid
+from collections.abc import AsyncGenerator
 from datetime import datetime, timezone
-from typing import AsyncGenerator, List, Optional, Union
 
 import httpx
 
@@ -95,6 +95,7 @@ class ChatAgent(ShieldRunnerMixin):
         tool_groups_api: ToolGroups,
         vector_io_api: VectorIO,
         persistence_store: KVStore,
+        created_at: str,
     ):
         self.agent_id = agent_id
         self.agent_config = agent_config
@@ -104,6 +105,7 @@ class ChatAgent(ShieldRunnerMixin):
         self.storage = AgentPersistence(agent_id, persistence_store)
         self.tool_runtime_api = tool_runtime_api
         self.tool_groups_api = tool_groups_api
+        self.created_at = created_at
 
         ShieldRunnerMixin.__init__(
             self,
@@ -112,7 +114,7 @@ class ChatAgent(ShieldRunnerMixin):
             output_shields=agent_config.output_shields,
         )
 
-    def turn_to_messages(self, turn: Turn) -> List[Message]:
+    def turn_to_messages(self, turn: Turn) -> list[Message]:
         messages = []
 
         # NOTE: if a toolcall response is in a step, we do not add it when processing the input messages
@@ -161,7 +163,7 @@ class ChatAgent(ShieldRunnerMixin):
     async def create_session(self, name: str) -> str:
         return await self.storage.create_session(name)
 
-    async def get_messages_from_turns(self, turns: List[Turn]) -> List[Message]:
+    async def get_messages_from_turns(self, turns: list[Turn]) -> list[Message]:
         messages = []
         if self.agent_config.instructions != "":
             messages.append(SystemMessage(content=self.agent_config.instructions))
@@ -201,8 +203,8 @@ class ChatAgent(ShieldRunnerMixin):
 
     async def _run_turn(
         self,
-        request: Union[AgentTurnCreateRequest, AgentTurnResumeRequest],
-        turn_id: Optional[str] = None,
+        request: AgentTurnCreateRequest | AgentTurnResumeRequest,
+        turn_id: str | None = None,
     ) -> AsyncGenerator:
         assert request.stream is True, "Non-streaming not supported"
 
@@ -321,10 +323,10 @@ class ChatAgent(ShieldRunnerMixin):
         self,
         session_id: str,
         turn_id: str,
-        input_messages: List[Message],
+        input_messages: list[Message],
         sampling_params: SamplingParams,
         stream: bool = False,
-        documents: Optional[List[Document]] = None,
+        documents: list[Document] | None = None,
     ) -> AsyncGenerator:
         # Doing async generators makes downstream code much simpler and everything amenable to
         # streaming. However, it also makes things complicated here because AsyncGenerators cannot
@@ -374,8 +376,8 @@ class ChatAgent(ShieldRunnerMixin):
     async def run_multiple_shields_wrapper(
         self,
         turn_id: str,
-        messages: List[Message],
-        shields: List[str],
+        messages: list[Message],
+        shields: list[str],
         touchpoint: str,
     ) -> AsyncGenerator:
         async with tracing.span("run_shields") as span:
@@ -443,10 +445,10 @@ class ChatAgent(ShieldRunnerMixin):
         self,
         session_id: str,
         turn_id: str,
-        input_messages: List[Message],
+        input_messages: list[Message],
         sampling_params: SamplingParams,
         stream: bool = False,
-        documents: Optional[List[Document]] = None,
+        documents: list[Document] | None = None,
     ) -> AsyncGenerator:
         # if document is passed in a turn, we parse the raw text of the document
         # and sent it as a user message
@@ -760,7 +762,7 @@ class ChatAgent(ShieldRunnerMixin):
 
     async def _initialize_tools(
         self,
-        toolgroups_for_turn: Optional[List[AgentToolGroup]] = None,
+        toolgroups_for_turn: list[AgentToolGroup] | None = None,
     ) -> None:
         toolgroup_to_args = {}
         for toolgroup in (self.agent_config.toolgroups or []) + (toolgroups_for_turn or []):
@@ -847,7 +849,7 @@ class ChatAgent(ShieldRunnerMixin):
             tool_name_to_args,
         )
 
-    def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, Optional[str]]:
+    def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, str | None]:
         """Parse a toolgroup name into its components.
 
         Args:
@@ -921,7 +923,7 @@ async def get_raw_document_text(document: Document) -> str:
 
 def _interpret_content_as_attachment(
     content: str,
-) -> Optional[Attachment]:
+) -> Attachment | None:
     match = re.search(TOOLS_ATTACHMENT_KEY_REGEX, content)
     if match:
         snippet = match.group(1)
diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py
index 656178773..bcbfcbe31 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -4,11 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import json
 import logging
-import shutil
 import uuid
-from typing import AsyncGenerator, List, Optional, Union
+from collections.abc import AsyncGenerator
+from datetime import datetime, timezone
 
 from llama_stack.apis.agents import (
     Agent,
@@ -21,11 +20,16 @@ from llama_stack.apis.agents import (
     AgentTurnCreateRequest,
     AgentTurnResumeRequest,
     Document,
-    ListAgentSessionsResponse,
-    ListAgentsResponse,
+    ListOpenAIResponseInputItem,
+    ListOpenAIResponseObject,
+    OpenAIResponseInput,
+    OpenAIResponseInputTool,
+    OpenAIResponseObject,
+    Order,
     Session,
     Turn,
 )
+from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.inference import (
     Inference,
     ToolConfig,
@@ -37,12 +41,15 @@ from llama_stack.apis.safety import Safety
 from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
+from llama_stack.providers.utils.responses.responses_store import ResponsesStore
 
 from .agent_instance import ChatAgent
 from .config import MetaReferenceAgentsImplConfig
+from .openai_responses import OpenAIResponsesImpl
+from .persistence import AgentInfo
 
 logger = logging.getLogger()
-logger.setLevel(logging.INFO)
 
 
 class MetaReferenceAgentsImpl(Agents):
@@ -63,56 +70,65 @@ class MetaReferenceAgentsImpl(Agents):
         self.tool_groups_api = tool_groups_api
 
         self.in_memory_store = InmemoryKVStoreImpl()
+        self.openai_responses_impl: OpenAIResponsesImpl | None = None
 
     async def initialize(self) -> None:
         self.persistence_store = await kvstore_impl(self.config.persistence_store)
-
-        # check if "bwrap" is available
-        if not shutil.which("bwrap"):
-            logger.warning("Warning: `bwrap` is not available. Code interpreter tool will not work correctly.")
+        self.responses_store = ResponsesStore(self.config.responses_store)
+        await self.responses_store.initialize()
+        self.openai_responses_impl = OpenAIResponsesImpl(
+            inference_api=self.inference_api,
+            tool_groups_api=self.tool_groups_api,
+            tool_runtime_api=self.tool_runtime_api,
+            responses_store=self.responses_store,
+        )
 
     async def create_agent(
         self,
         agent_config: AgentConfig,
     ) -> AgentCreateResponse:
         agent_id = str(uuid.uuid4())
+        created_at = datetime.now(timezone.utc)
 
+        agent_info = AgentInfo(
+            **agent_config.model_dump(),
+            created_at=created_at,
+        )
+
+        # Store the agent info
         await self.persistence_store.set(
             key=f"agent:{agent_id}",
-            value=agent_config.model_dump_json(),
+            value=agent_info.model_dump_json(),
         )
+
         return AgentCreateResponse(
             agent_id=agent_id,
         )
 
     async def _get_agent_impl(self, agent_id: str) -> ChatAgent:
-        agent_config = await self.persistence_store.get(
+        agent_info_json = await self.persistence_store.get(
             key=f"agent:{agent_id}",
         )
-        if not agent_config:
-            raise ValueError(f"Could not find agent config for {agent_id}")
+        if not agent_info_json:
+            raise ValueError(f"Could not find agent info for {agent_id}")
 
         try:
-            agent_config = json.loads(agent_config)
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Could not JSON decode agent config for {agent_id}") from e
-
-        try:
-            agent_config = AgentConfig(**agent_config)
+            agent_info = AgentInfo.model_validate_json(agent_info_json)
         except Exception as e:
-            raise ValueError(f"Could not validate(?) agent config for {agent_id}") from e
+            raise ValueError(f"Could not validate agent info for {agent_id}") from e
 
         return ChatAgent(
             agent_id=agent_id,
-            agent_config=agent_config,
+            agent_config=agent_info,
             inference_api=self.inference_api,
             safety_api=self.safety_api,
             vector_io_api=self.vector_io_api,
             tool_runtime_api=self.tool_runtime_api,
             tool_groups_api=self.tool_groups_api,
             persistence_store=(
-                self.persistence_store if agent_config.enable_session_persistence else self.in_memory_store
+                self.persistence_store if agent_info.enable_session_persistence else self.in_memory_store
             ),
+            created_at=agent_info.created_at,
         )
 
     async def create_agent_session(
@@ -131,16 +147,11 @@ class MetaReferenceAgentsImpl(Agents):
         self,
         agent_id: str,
         session_id: str,
-        messages: List[
-            Union[
-                UserMessage,
-                ToolResponseMessage,
-            ]
-        ],
-        toolgroups: Optional[List[AgentToolGroup]] = None,
-        documents: Optional[List[Document]] = None,
-        stream: Optional[bool] = False,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[UserMessage | ToolResponseMessage],
+        toolgroups: list[AgentToolGroup] | None = None,
+        documents: list[Document] | None = None,
+        stream: bool | None = False,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         request = AgentTurnCreateRequest(
             agent_id=agent_id,
@@ -169,8 +180,8 @@ class MetaReferenceAgentsImpl(Agents):
         agent_id: str,
         session_id: str,
         turn_id: str,
-        tool_responses: List[ToolResponse],
-        stream: Optional[bool] = False,
+        tool_responses: list[ToolResponse],
+        stream: bool | None = False,
     ) -> AsyncGenerator:
         request = AgentTurnResumeRequest(
             agent_id=agent_id,
@@ -208,9 +219,10 @@ class MetaReferenceAgentsImpl(Agents):
         self,
         agent_id: str,
         session_id: str,
-        turn_ids: Optional[List[str]] = None,
+        turn_ids: list[str] | None = None,
     ) -> Session:
         agent = await self._get_agent_impl(agent_id)
+
         session_info = await agent.storage.get_session_info(session_id)
         if session_info is None:
             raise ValueError(f"Session {session_id} not found")
@@ -225,22 +237,117 @@ class MetaReferenceAgentsImpl(Agents):
         )
 
     async def delete_agents_session(self, agent_id: str, session_id: str) -> None:
-        await self.persistence_store.delete(f"session:{agent_id}:{session_id}")
+        agent = await self._get_agent_impl(agent_id)
+        session_info = await agent.storage.get_session_info(session_id)
+        if session_info is None:
+            raise ValueError(f"Session {session_id} not found")
+
+        # Delete turns first, then the session
+        await agent.storage.delete_session_turns(session_id)
+        await agent.storage.delete_session(session_id)
 
     async def delete_agent(self, agent_id: str) -> None:
+        # First get all sessions for this agent
+        agent = await self._get_agent_impl(agent_id)
+        sessions = await agent.storage.list_sessions()
+
+        # Delete all sessions
+        for session in sessions:
+            await self.delete_agents_session(agent_id, session.session_id)
+
+        # Finally delete the agent itself
         await self.persistence_store.delete(f"agent:{agent_id}")
 
+    async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
+        agent_keys = await self.persistence_store.keys_in_range("agent:", "agent:\xff")
+        agent_list: list[Agent] = []
+        for agent_key in agent_keys:
+            agent_id = agent_key.split(":")[1]
+
+            # Get the agent info using the key
+            agent_info_json = await self.persistence_store.get(agent_key)
+            if not agent_info_json:
+                logger.error(f"Could not find agent info for key {agent_key}")
+                continue
+
+            try:
+                agent_info = AgentInfo.model_validate_json(agent_info_json)
+                agent_list.append(
+                    Agent(
+                        agent_id=agent_id,
+                        agent_config=agent_info,
+                        created_at=agent_info.created_at,
+                    )
+                )
+            except Exception as e:
+                logger.error(f"Error parsing agent info for {agent_id}: {e}")
+                continue
+
+        # Convert Agent objects to dictionaries
+        agent_dicts = [agent.model_dump() for agent in agent_list]
+        return paginate_records(agent_dicts, start_index, limit)
+
+    async def get_agent(self, agent_id: str) -> Agent:
+        chat_agent = await self._get_agent_impl(agent_id)
+        agent = Agent(
+            agent_id=agent_id,
+            agent_config=chat_agent.agent_config,
+            created_at=chat_agent.created_at,
+        )
+        return agent
+
+    async def list_agent_sessions(
+        self, agent_id: str, start_index: int | None = None, limit: int | None = None
+    ) -> PaginatedResponse:
+        agent = await self._get_agent_impl(agent_id)
+        sessions = await agent.storage.list_sessions()
+        # Convert Session objects to dictionaries
+        session_dicts = [session.model_dump() for session in sessions]
+        return paginate_records(session_dicts, start_index, limit)
+
     async def shutdown(self) -> None:
         pass
 
-    async def list_agents(self) -> ListAgentsResponse:
-        pass
-
-    async def get_agent(self, agent_id: str) -> Agent:
-        pass
-
-    async def list_agent_sessions(
+    # OpenAI responses
+    async def get_openai_response(
         self,
-        agent_id: str,
-    ) -> ListAgentSessionsResponse:
-        pass
+        response_id: str,
+    ) -> OpenAIResponseObject:
+        return await self.openai_responses_impl.get_openai_response(response_id)
+
+    async def create_openai_response(
+        self,
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        instructions: str | None = None,
+        previous_response_id: str | None = None,
+        store: bool | None = True,
+        stream: bool | None = False,
+        temperature: float | None = None,
+        tools: list[OpenAIResponseInputTool] | None = None,
+    ) -> OpenAIResponseObject:
+        return await self.openai_responses_impl.create_openai_response(
+            input, model, instructions, previous_response_id, store, stream, temperature, tools
+        )
+
+    async def list_openai_responses(
+        self,
+        after: str | None = None,
+        limit: int | None = 50,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIResponseObject:
+        return await self.openai_responses_impl.list_openai_responses(after, limit, model, order)
+
+    async def list_openai_response_input_items(
+        self,
+        response_id: str,
+        after: str | None = None,
+        before: str | None = None,
+        include: list[str] | None = None,
+        limit: int | None = 20,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIResponseInputItem:
+        return await self.openai_responses_impl.list_openai_response_input_items(
+            response_id, after, before, include, limit, order
+        )
diff --git a/llama_stack/providers/inline/agents/meta_reference/config.py b/llama_stack/providers/inline/agents/meta_reference/config.py
index ff34e5d5f..1c392f29c 100644
--- a/llama_stack/providers/inline/agents/meta_reference/config.py
+++ b/llama_stack/providers/inline/agents/meta_reference/config.py
@@ -4,22 +4,28 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
 from llama_stack.providers.utils.kvstore import KVStoreConfig
 from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig
 
 
 class MetaReferenceAgentsImplConfig(BaseModel):
     persistence_store: KVStoreConfig
+    responses_store: SqlStoreConfig
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
         return {
             "persistence_store": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
                 db_name="agents_store.db",
-            )
+            ),
+            "responses_store": SqliteSqlStoreConfig.sample_run_config(
+                __distro_dir__=__distro_dir__,
+                db_name="responses_store.db",
+            ),
         }
diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
new file mode 100644
index 000000000..19d7ea56f
--- /dev/null
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@@ -0,0 +1,776 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+import time
+import uuid
+from collections.abc import AsyncIterator
+from typing import Any, cast
+
+from openai.types.chat import ChatCompletionToolParam
+from pydantic import BaseModel
+
+from llama_stack.apis.agents import Order
+from llama_stack.apis.agents.openai_responses import (
+    AllowedToolsFilter,
+    ListOpenAIResponseInputItem,
+    ListOpenAIResponseObject,
+    OpenAIResponseInput,
+    OpenAIResponseInputFunctionToolCallOutput,
+    OpenAIResponseInputMessageContent,
+    OpenAIResponseInputMessageContentImage,
+    OpenAIResponseInputMessageContentText,
+    OpenAIResponseInputTool,
+    OpenAIResponseInputToolMCP,
+    OpenAIResponseMessage,
+    OpenAIResponseObject,
+    OpenAIResponseObjectStream,
+    OpenAIResponseObjectStreamResponseCompleted,
+    OpenAIResponseObjectStreamResponseCreated,
+    OpenAIResponseObjectStreamResponseOutputTextDelta,
+    OpenAIResponseOutput,
+    OpenAIResponseOutputMessageContent,
+    OpenAIResponseOutputMessageContentOutputText,
+    OpenAIResponseOutputMessageFunctionToolCall,
+    OpenAIResponseOutputMessageMCPListTools,
+    OpenAIResponseOutputMessageWebSearchToolCall,
+)
+from llama_stack.apis.inference.inference import (
+    Inference,
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartParam,
+    OpenAIChatCompletionContentPartTextParam,
+    OpenAIChatCompletionToolCall,
+    OpenAIChatCompletionToolCallFunction,
+    OpenAIChoice,
+    OpenAIDeveloperMessageParam,
+    OpenAIImageURL,
+    OpenAIMessageParam,
+    OpenAISystemMessageParam,
+    OpenAIToolMessageParam,
+    OpenAIUserMessageParam,
+)
+from llama_stack.apis.tools.tools import ToolGroups, ToolRuntime
+from llama_stack.log import get_logger
+from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
+from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
+from llama_stack.providers.utils.responses.responses_store import ResponsesStore
+from llama_stack.providers.utils.tools.mcp import invoke_mcp_tool, list_mcp_tools
+
+logger = get_logger(name=__name__, category="openai_responses")
+
+OPENAI_RESPONSES_PREFIX = "openai_responses:"
+
+
+async def _convert_response_content_to_chat_content(
+    content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent],
+) -> str | list[OpenAIChatCompletionContentPartParam]:
+    """
+    Convert the content parts from an OpenAI Response API request into OpenAI Chat Completion content parts.
+
+    The content schemas of each API look similar, but are not exactly the same.
+    """
+    if isinstance(content, str):
+        return content
+
+    converted_parts = []
+    for content_part in content:
+        if isinstance(content_part, OpenAIResponseInputMessageContentText):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text))
+        elif isinstance(content_part, OpenAIResponseOutputMessageContentOutputText):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text))
+        elif isinstance(content_part, OpenAIResponseInputMessageContentImage):
+            if content_part.image_url:
+                image_url = OpenAIImageURL(url=content_part.image_url, detail=content_part.detail)
+                converted_parts.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url))
+        elif isinstance(content_part, str):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part))
+        else:
+            raise ValueError(
+                f"Llama Stack OpenAI Responses does not yet support content type '{type(content_part)}' in this context"
+            )
+    return converted_parts
+
+
+async def _convert_response_input_to_chat_messages(
+    input: str | list[OpenAIResponseInput],
+) -> list[OpenAIMessageParam]:
+    """
+    Convert the input from an OpenAI Response API request into OpenAI Chat Completion messages.
+    """
+    messages: list[OpenAIMessageParam] = []
+    if isinstance(input, list):
+        for input_item in input:
+            if isinstance(input_item, OpenAIResponseInputFunctionToolCallOutput):
+                messages.append(
+                    OpenAIToolMessageParam(
+                        content=input_item.output,
+                        tool_call_id=input_item.call_id,
+                    )
+                )
+            elif isinstance(input_item, OpenAIResponseOutputMessageFunctionToolCall):
+                tool_call = OpenAIChatCompletionToolCall(
+                    index=0,
+                    id=input_item.call_id,
+                    function=OpenAIChatCompletionToolCallFunction(
+                        name=input_item.name,
+                        arguments=input_item.arguments,
+                    ),
+                )
+                messages.append(OpenAIAssistantMessageParam(tool_calls=[tool_call]))
+            else:
+                content = await _convert_response_content_to_chat_content(input_item.content)
+                message_type = await _get_message_type_by_role(input_item.role)
+                if message_type is None:
+                    raise ValueError(
+                        f"Llama Stack OpenAI Responses does not yet support message role '{input_item.role}' in this context"
+                    )
+                messages.append(message_type(content=content))
+    else:
+        messages.append(OpenAIUserMessageParam(content=input))
+    return messages
+
+
+async def _convert_chat_choice_to_response_message(choice: OpenAIChoice) -> OpenAIResponseMessage:
+    """
+    Convert an OpenAI Chat Completion choice into an OpenAI Response output message.
+    """
+    output_content = ""
+    if isinstance(choice.message.content, str):
+        output_content = choice.message.content
+    elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam):
+        output_content = choice.message.content.text
+    else:
+        raise ValueError(
+            f"Llama Stack OpenAI Responses does not yet support output content type: {type(choice.message.content)}"
+        )
+
+    return OpenAIResponseMessage(
+        id=f"msg_{uuid.uuid4()}",
+        content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
+        status="completed",
+        role="assistant",
+    )
+
+
+async def _get_message_type_by_role(role: str):
+    role_to_type = {
+        "user": OpenAIUserMessageParam,
+        "system": OpenAISystemMessageParam,
+        "assistant": OpenAIAssistantMessageParam,
+        "developer": OpenAIDeveloperMessageParam,
+    }
+    return role_to_type.get(role)
+
+
+class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
+    input_items: ListOpenAIResponseInputItem
+    response: OpenAIResponseObject
+
+
+class ChatCompletionContext(BaseModel):
+    model: str
+    messages: list[OpenAIMessageParam]
+    tools: list[ChatCompletionToolParam] | None = None
+    mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
+    stream: bool
+    temperature: float | None
+
+
+class OpenAIResponsesImpl:
+    def __init__(
+        self,
+        inference_api: Inference,
+        tool_groups_api: ToolGroups,
+        tool_runtime_api: ToolRuntime,
+        responses_store: ResponsesStore,
+    ):
+        self.inference_api = inference_api
+        self.tool_groups_api = tool_groups_api
+        self.tool_runtime_api = tool_runtime_api
+        self.responses_store = responses_store
+
+    async def _prepend_previous_response(
+        self, input: str | list[OpenAIResponseInput], previous_response_id: str | None = None
+    ):
+        """Return `input` preceded by the stored input and output items of `previous_response_id`.
+
+        `input` is returned unchanged when no previous response id is given.
+        """
+        if not previous_response_id:
+            return input
+
+        previous_response_with_input = await self.responses_store.get_response_object(previous_response_id)
+
+        # Build a new list so the object handed back by the store is never mutated in place.
+        new_input_items = [*previous_response_with_input.input, *previous_response_with_input.output]
+
+        # new input items from the current request
+        if isinstance(input, str):
+            new_input_items.append(OpenAIResponseMessage(content=input, role="user"))
+        else:
+            new_input_items.extend(input)
+        return new_input_items
+
+    async def _prepend_instructions(self, messages, instructions):
+        if instructions:
+            messages.insert(0, OpenAISystemMessageParam(content=instructions))
+
+    async def get_openai_response(
+        self,
+        response_id: str,
+    ) -> OpenAIResponseObject:
+        response_with_input = await self.responses_store.get_response_object(response_id)
+        return OpenAIResponseObject(**{k: v for k, v in response_with_input.model_dump().items() if k != "input"})
+
+    async def list_openai_responses(
+        self,
+        after: str | None = None,
+        limit: int | None = 50,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIResponseObject:
+        return await self.responses_store.list_responses(after, limit, model, order)
+
+    async def list_openai_response_input_items(
+        self,
+        response_id: str,
+        after: str | None = None,
+        before: str | None = None,
+        include: list[str] | None = None,
+        limit: int | None = 20,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIResponseInputItem:
+        """List input items for a given OpenAI response.
+
+        :param response_id: The ID of the response to retrieve input items for.
+        :param after: An item ID to list items after, used for pagination.
+        :param before: An item ID to list items before, used for pagination.
+        :param include: Additional fields to include in the response.
+        :param limit: A limit on the number of objects to be returned.
+        :param order: The order to return the input items in.
+        :returns: A ListOpenAIResponseInputItem.
+        """
+        return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order)
+
+    async def _process_response_choices(
+        self,
+        chat_response: OpenAIChatCompletion,
+        ctx: ChatCompletionContext,
+        tools: list[OpenAIResponseInputTool] | None,
+    ) -> list[OpenAIResponseOutput]:
+        """Handle tool execution and response message creation."""
+        output_messages: list[OpenAIResponseOutput] = []
+        # Execute tool calls if any
+        for choice in chat_response.choices:
+            if choice.message.tool_calls and tools:
+                # Assume if the first tool is a function, all tools are functions
+                if tools[0].type == "function":
+                    for tool_call in choice.message.tool_calls:
+                        output_messages.append(
+                            OpenAIResponseOutputMessageFunctionToolCall(
+                                arguments=tool_call.function.arguments or "",
+                                call_id=tool_call.id,
+                                name=tool_call.function.name or "",
+                                id=f"fc_{uuid.uuid4()}",
+                                status="completed",
+                            )
+                        )
+                else:
+                    tool_messages = await self._execute_tool_and_return_final_output(choice, ctx)
+                    output_messages.extend(tool_messages)
+            else:
+                output_messages.append(await _convert_chat_choice_to_response_message(choice))
+
+        return output_messages
+
+    async def _store_response(
+        self,
+        response: OpenAIResponseObject,
+        input: str | list[OpenAIResponseInput],
+    ) -> None:
+        """Persist `response` together with its input items in the responses store.
+
+        String input becomes one user message; messages lacking an id each get a fresh one.
+        """
+        if isinstance(input, str):
+            # synthesize a message from the input string
+            input_items_data = [
+                OpenAIResponseMessage(
+                    role="user",
+                    content=[OpenAIResponseInputMessageContentText(text=input)],
+                    id=f"msg_{uuid.uuid4()}",
+                )
+            ]
+        else:
+            # we already have a list of messages
+            input_items_data = []
+            for input_item in input:
+                if isinstance(input_item, OpenAIResponseMessage):
+                    # model_dump() normally emits the "id" key even when unset (None), so test the value
+                    input_item_dict = input_item.model_dump()
+                    if not input_item_dict.get("id"):
+                        input_item_dict["id"] = f"msg_{uuid.uuid4()}"
+                    input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+                else:
+                    input_items_data.append(input_item)
+
+        await self.responses_store.store_response_object(response_object=response, input=input_items_data)
+
+    async def create_openai_response(
+        self,
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        instructions: str | None = None,
+        previous_response_id: str | None = None,
+        store: bool | None = True,
+        stream: bool | None = False,
+        temperature: float | None = None,
+        tools: list[OpenAIResponseInputTool] | None = None,
+    ):
+        stream = False if stream is None else stream
+
+        output_messages: list[OpenAIResponseOutput] = []
+
+        # Input preprocessing
+        input = await self._prepend_previous_response(input, previous_response_id)
+        messages = await _convert_response_input_to_chat_messages(input)
+        await self._prepend_instructions(messages, instructions)
+
+        # Tool setup
+        chat_tools, mcp_tool_to_server, mcp_list_message = (
+            await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
+        )
+        if mcp_list_message:
+            output_messages.append(mcp_list_message)
+
+        ctx = ChatCompletionContext(
+            model=model,
+            messages=messages,
+            tools=chat_tools,
+            mcp_tool_to_server=mcp_tool_to_server,
+            stream=stream,
+            temperature=temperature,
+        )
+
+        inference_result = await self.inference_api.openai_chat_completion(
+            model=model,
+            messages=messages,
+            tools=chat_tools,
+            stream=stream,
+            temperature=temperature,
+        )
+
+        if stream:
+            return self._create_streaming_response(
+                inference_result=inference_result,
+                ctx=ctx,
+                output_messages=output_messages,
+                input=input,
+                model=model,
+                store=store,
+                tools=tools,
+            )
+        else:
+            return await self._create_non_streaming_response(
+                inference_result=inference_result,
+                ctx=ctx,
+                output_messages=output_messages,
+                input=input,
+                model=model,
+                store=store,
+                tools=tools,
+            )
+
+    async def _create_non_streaming_response(
+        self,
+        inference_result: Any,
+        ctx: ChatCompletionContext,
+        output_messages: list[OpenAIResponseOutput],
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        store: bool | None,
+        tools: list[OpenAIResponseInputTool] | None,
+    ) -> OpenAIResponseObject:
+        chat_response = OpenAIChatCompletion(**inference_result.model_dump())
+
+        # Process response choices (tool execution and message creation)
+        output_messages.extend(
+            await self._process_response_choices(
+                chat_response=chat_response,
+                ctx=ctx,
+                tools=tools,
+            )
+        )
+
+        response = OpenAIResponseObject(
+            created_at=chat_response.created,
+            id=f"resp-{uuid.uuid4()}",
+            model=model,
+            object="response",
+            status="completed",
+            output=output_messages,
+        )
+        logger.debug(f"OpenAI Responses response: {response}")
+
+        # Store response if requested
+        if store:
+            await self._store_response(
+                response=response,
+                input=input,
+            )
+
+        return response
+
+    async def _create_streaming_response(
+        self,
+        inference_result: Any,
+        ctx: ChatCompletionContext,
+        output_messages: list[OpenAIResponseOutput],
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        store: bool | None,
+        tools: list[OpenAIResponseInputTool] | None,
+    ) -> AsyncIterator[OpenAIResponseObjectStream]:
+        """Turn a stream of chat-completion chunks into Responses stream events.
+
+        Event order produced by this generator:
+          1. one "created" event carrying an ``in_progress`` response,
+          2. one text-delta event per chunk choice that has non-empty content,
+          3. one "completed" event carrying the final response.
+
+        While streaming, text and tool-call fragments are accumulated so that a
+        single ``OpenAIChatCompletion`` can be rebuilt and handed to
+        ``self._process_response_choices``; the messages it returns are appended
+        to ``output_messages``.
+
+        Args:
+            inference_result: Async iterable of chat-completion chunks.
+            ctx: Chat completion context forwarded to ``_process_response_choices``.
+            output_messages: Output items gathered so far. NOTE: this list is
+                extended in place with the processed choices.
+            input: Original request input; only used when storing the response.
+            model: Model identifier reported on the emitted response objects.
+            store: When truthy, the final response is passed to ``self._store_response``.
+            tools: Input tools forwarded to ``_process_response_choices``.
+
+        Yields:
+            The stream events described above, in that order.
+        """
+        # Create initial response and emit response.created immediately
+        response_id = f"resp-{uuid.uuid4()}"
+        created_at = int(time.time())
+
+        # The output list is copied so the later in-place extend of output_messages
+        # does not alter the already-emitted initial response.
+        initial_response = OpenAIResponseObject(
+            created_at=created_at,
+            id=response_id,
+            model=model,
+            object="response",
+            status="in_progress",
+            output=output_messages.copy(),
+        )
+
+        # Emit response.created immediately
+        yield OpenAIResponseObjectStreamResponseCreated(response=initial_response)
+
+        # For streaming, inference_result is an async iterator of chunks
+        # Stream chunks and emit delta events as they arrive
+        chat_response_id = ""
+        chat_response_content = []
+        # Partially assembled tool calls, keyed by the tool call's index in the delta
+        chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
+        chunk_created = 0
+        chunk_model = ""
+        chunk_finish_reason = ""
+        sequence_number = 0
+
+        # Create a placeholder message item for delta events
+        message_item_id = f"msg_{uuid.uuid4()}"
+
+        async for chunk in inference_result:
+            # Overwritten on every chunk, so the values of the last chunk are the ones kept
+            chat_response_id = chunk.id
+            chunk_created = chunk.created
+            chunk_model = chunk.model
+            for chunk_choice in chunk.choices:
+                # Emit incremental text content as delta events
+                if chunk_choice.delta.content:
+                    sequence_number += 1
+                    yield OpenAIResponseObjectStreamResponseOutputTextDelta(
+                        content_index=0,
+                        delta=chunk_choice.delta.content,
+                        item_id=message_item_id,
+                        output_index=0,
+                        sequence_number=sequence_number,
+                    )
+
+                # Collect content for final response
+                chat_response_content.append(chunk_choice.delta.content or "")
+                # Keep the most recent non-empty finish reason
+                if chunk_choice.finish_reason:
+                    chunk_finish_reason = chunk_choice.finish_reason
+
+                # Aggregate tool call arguments across chunks, using their index as the aggregation key
+                if chunk_choice.delta.tool_calls:
+                    for tool_call in chunk_choice.delta.tool_calls:
+                        response_tool_call = chat_response_tool_calls.get(tool_call.index, None)
+                        if response_tool_call:
+                            # Don't attempt to concatenate arguments if we don't have any new arguments
+                            if tool_call.function.arguments:
+                                # Guard against an initial None argument before we concatenate
+                                response_tool_call.function.arguments = (
+                                    response_tool_call.function.arguments or ""
+                                ) + tool_call.function.arguments
+                        else:
+                            # First fragment seen for this index: build the accumulator from the
+                            # delta's fields, with the "type" key removed before construction
+                            tool_call_dict: dict[str, Any] = tool_call.model_dump()
+                            tool_call_dict.pop("type", None)
+                            response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict)
+                        chat_response_tool_calls[tool_call.index] = response_tool_call
+
+        # Convert collected chunks to complete response
+        if chat_response_tool_calls:
+            # Order the assembled tool calls by their index
+            tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())]
+        else:
+            tool_calls = None
+        # Content from every chunk choice is joined into one assistant message
+        assistant_message = OpenAIAssistantMessageParam(
+            content="".join(chat_response_content),
+            tool_calls=tool_calls,
+        )
+        chat_response_obj = OpenAIChatCompletion(
+            id=chat_response_id,
+            choices=[
+                OpenAIChoice(
+                    message=assistant_message,
+                    finish_reason=chunk_finish_reason,
+                    index=0,
+                )
+            ],
+            created=chunk_created,
+            model=chunk_model,
+        )
+
+        # Process response choices (tool execution and message creation)
+        # NOTE: this mutates the caller-supplied output_messages list
+        output_messages.extend(
+            await self._process_response_choices(
+                chat_response=chat_response_obj,
+                ctx=ctx,
+                tools=tools,
+            )
+        )
+
+        # Create final response
+        # Reuses the id and created_at of the initial response so both events describe the same object
+        final_response = OpenAIResponseObject(
+            created_at=created_at,
+            id=response_id,
+            model=model,
+            object="response",
+            status="completed",
+            output=output_messages,
+        )
+
+        if store:
+            await self._store_response(
+                response=final_response,
+                input=input,
+            )
+
+        # Emit response.completed
+        yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
+
+    async def _convert_response_tools_to_chat_tools(
+        self, tools: list[OpenAIResponseInputTool]
+    ) -> tuple[
+        list[ChatCompletionToolParam],
+        dict[str, OpenAIResponseInputToolMCP],
+        OpenAIResponseOutput | None,
+    ]:
+        """Translate Responses input tools into chat-completion tool parameters.
+
+        Handles three input tool types: ``function`` (passed through),
+        ``web_search`` (looked up via ``self.tool_groups_api``) and ``mcp``
+        (tools listed from the MCP server and filtered by ``allowed_tools``).
+
+        Args:
+            tools: The input tools supplied with the request.
+
+        Returns:
+            A tuple of:
+              - the chat-completion tool params, in input order,
+              - a mapping from MCP tool name to the MCP input tool that provides it,
+              - the MCP list-tools output message, or None when no ``mcp`` tool was given.
+                NOTE: the message is reassigned for each ``mcp`` input tool, so with
+                several MCP tools only the last one's listing is returned.
+
+        Raises:
+            ValueError: If the ``web_search`` tool is not found, if two MCP tools
+                share a name, or if the input tool type is not supported.
+        """
+        from llama_stack.apis.agents.openai_responses import (
+            MCPListToolsTool,
+        )
+        from llama_stack.apis.tools.tools import Tool
+
+        mcp_tool_to_server = {}
+
+        # Builds a chat tool param from a Tool by first mapping its parameters to a ToolDefinition
+        def make_openai_tool(tool_name: str, tool: Tool) -> ChatCompletionToolParam:
+            tool_def = ToolDefinition(
+                tool_name=tool_name,
+                description=tool.description,
+                parameters={
+                    param.name: ToolParamDefinition(
+                        param_type=param.parameter_type,
+                        description=param.description,
+                        required=param.required,
+                        default=param.default,
+                    )
+                    for param in tool.parameters
+                },
+            )
+            return convert_tooldef_to_openai_tool(tool_def)
+
+        mcp_list_message = None
+        chat_tools: list[ChatCompletionToolParam] = []
+        for input_tool in tools:
+            # TODO: Handle other tool types
+            if input_tool.type == "function":
+                chat_tools.append(ChatCompletionToolParam(type="function", function=input_tool.model_dump()))
+            elif input_tool.type == "web_search":
+                tool_name = "web_search"
+                tool = await self.tool_groups_api.get_tool(tool_name)
+                if not tool:
+                    raise ValueError(f"Tool {tool_name} not found")
+                chat_tools.append(make_openai_tool(tool_name, tool))
+            elif input_tool.type == "mcp":
+                # allowed_tools is either a plain list of names (treated as the allow-list)
+                # or an AllowedToolsFilter carrying separate always/never lists
+                always_allowed = None
+                never_allowed = None
+                if input_tool.allowed_tools:
+                    if isinstance(input_tool.allowed_tools, list):
+                        always_allowed = input_tool.allowed_tools
+                    elif isinstance(input_tool.allowed_tools, AllowedToolsFilter):
+                        always_allowed = input_tool.allowed_tools.always
+                        never_allowed = input_tool.allowed_tools.never
+
+                tool_defs = await list_mcp_tools(
+                    endpoint=input_tool.server_url,
+                    headers=input_tool.headers or {},
+                )
+
+                mcp_list_message = OpenAIResponseOutputMessageMCPListTools(
+                    id=f"mcp_list_{uuid.uuid4()}",
+                    status="completed",
+                    server_label=input_tool.server_label,
+                    tools=[],
+                )
+                for t in tool_defs.data:
+                    # The never-list takes precedence over the allow-list
+                    if never_allowed and t.name in never_allowed:
+                        continue
+                    # With no (or an empty) allow-list every remaining tool is accepted
+                    if not always_allowed or t.name in always_allowed:
+                        chat_tools.append(make_openai_tool(t.name, t))
+                        if t.name in mcp_tool_to_server:
+                            raise ValueError(f"Duplicate tool name {t.name} found for server {input_tool.server_label}")
+                        mcp_tool_to_server[t.name] = input_tool
+                        # Describe the tool in the list-tools message as an object schema
+                        # built from its parameters
+                        mcp_list_message.tools.append(
+                            MCPListToolsTool(
+                                name=t.name,
+                                description=t.description,
+                                input_schema={
+                                    "type": "object",
+                                    "properties": {
+                                        p.name: {
+                                            "type": p.parameter_type,
+                                            "description": p.description,
+                                        }
+                                        for p in t.parameters
+                                    },
+                                    "required": [p.name for p in t.parameters if p.required],
+                                },
+                            )
+                        )
+            else:
+                raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}")
+        return chat_tools, mcp_tool_to_server, mcp_list_message
+
+    async def _execute_tool_and_return_final_output(
+        self,
+        choice: OpenAIChoice,
+        ctx: ChatCompletionContext,
+    ) -> list[OpenAIResponseOutput]:
+        """Run the tool calls of an assistant choice and ask the model for a follow-up.
+
+        Each tool call in ``choice.message`` is executed; the resulting tool-call
+        log items are collected, and the tool results are appended to a copy of
+        ``ctx.messages`` that is sent back to the model once. The choices of that
+        follow-up completion are converted to response messages and appended
+        after the tool-call log items.
+
+        Args:
+            choice: The chat completion choice that may contain tool calls.
+            ctx: Chat completion context (model, messages, temperature, MCP tool mapping).
+
+        Returns:
+            Tool-call log items followed by the follow-up messages. Empty when the
+            choice's message is not an assistant message or carries no tool calls.
+        """
+        output_messages: list[OpenAIResponseOutput] = []
+
+        if not isinstance(choice.message, OpenAIAssistantMessageParam):
+            return output_messages
+
+        if not choice.message.tool_calls:
+            return output_messages
+
+        # Work on a copy so the context's own message list is left untouched
+        next_turn_messages = ctx.messages.copy()
+
+        # Add the assistant message with tool_calls response to the messages list
+        next_turn_messages.append(choice.message)
+
+        for tool_call in choice.message.tool_calls:
+            # TODO: telemetry spans for tool calls
+            tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
+            if tool_call_log:
+                output_messages.append(tool_call_log)
+            if further_input:
+                next_turn_messages.append(further_input)
+
+        # The result is consumed below as a complete OpenAIChatCompletion (``.choices`` with
+        # full messages), so the follow-up request must be non-streaming even when the
+        # surrounding response is streamed; a streamed result would be an async iterator.
+        tool_results_chat_response = await self.inference_api.openai_chat_completion(
+            model=ctx.model,
+            messages=next_turn_messages,
+            stream=False,
+            temperature=ctx.temperature,
+        )
+        # type cast to appease mypy: the inference API's return type also covers the streaming case
+        tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response)
+
+        # Huge TODO: these are NOT the final outputs, we must keep the loop going
+        tool_final_outputs = [
+            await _convert_chat_choice_to_response_message(result_choice)
+            for result_choice in tool_results_chat_response.choices
+        ]
+        # TODO: Wire in annotations with URLs, titles, etc to these output messages
+        output_messages.extend(tool_final_outputs)
+        return output_messages
+
+    async def _execute_tool_call(
+        self,
+        tool_call: OpenAIChatCompletionToolCall,
+        ctx: ChatCompletionContext,
+    ) -> tuple[OpenAIResponseOutput | None, OpenAIMessageParam | None]:
+        """Invoke one tool call and describe the outcome in two forms.
+
+        Tools registered in ``ctx.mcp_tool_to_server`` are invoked on their MCP
+        server; any other tool goes through ``self.tool_runtime_api``. Exceptions
+        raised by the invocation are caught and reported on the output message
+        instead of propagating.
+
+        Args:
+            tool_call: The tool call requested by the model.
+            ctx: Chat completion context holding the MCP tool-to-server mapping.
+
+        Returns:
+            A tuple ``(message, input_message)``:
+              - ``message``: the response output item recording the call (an MCP call
+                item or a web-search call item),
+              - ``input_message``: a tool message with the result content for the next
+                model turn, or None when there is no result content.
+            ``(None, None)`` is returned when the call lacks a function, an id or a
+            function name.
+
+        Raises:
+            ValueError: If the tool is neither an MCP tool nor ``web_search``, or if
+                the result content has an unsupported type.
+        """
+        from llama_stack.providers.utils.inference.prompt_adapter import (
+            interleaved_content_as_str,
+        )
+
+        tool_call_id = tool_call.id
+        function = tool_call.function
+
+        if not function or not tool_call_id or not function.name:
+            return None, None
+
+        error_exc = None
+        result = None
+        try:
+            if function.name in ctx.mcp_tool_to_server:
+                mcp_tool = ctx.mcp_tool_to_server[function.name]
+                result = await invoke_mcp_tool(
+                    endpoint=mcp_tool.server_url,
+                    headers=mcp_tool.headers or {},
+                    tool_name=function.name,
+                    kwargs=json.loads(function.arguments) if function.arguments else {},
+                )
+            else:
+                result = await self.tool_runtime_api.invoke_tool(
+                    tool_name=function.name,
+                    kwargs=json.loads(function.arguments) if function.arguments else {},
+                )
+        except Exception as e:
+            # Keep the failure (including argument JSON decoding errors) for reporting below;
+            # `result` stays None in this case
+            error_exc = e
+
+        if function.name in ctx.mcp_tool_to_server:
+            from llama_stack.apis.agents.openai_responses import OpenAIResponseOutputMessageMCPCall
+
+            message = OpenAIResponseOutputMessageMCPCall(
+                id=tool_call_id,
+                arguments=function.arguments,
+                name=function.name,
+                server_label=ctx.mcp_tool_to_server[function.name].server_label,
+            )
+            # error_exc is tested first so that `result` is only read when the call succeeded
+            if error_exc:
+                message.error = str(error_exc)
+            elif (result.error_code and result.error_code > 0) or result.error_message:
+                message.error = f"Error (code {result.error_code}): {result.error_message}"
+            elif result.content:
+                message.output = interleaved_content_as_str(result.content)
+        else:
+            if function.name == "web_search":
+                message = OpenAIResponseOutputMessageWebSearchToolCall(
+                    id=tool_call_id,
+                    status="completed",
+                )
+                # Short-circuits on error_exc, so `result` is not dereferenced after a failed call
+                if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
+                    message.status = "failed"
+            else:
+                raise ValueError(f"Unknown tool {function.name} called")
+
+        # Build the tool message fed back to the model; only produced when there is result content
+        input_message = None
+        if result and result.content:
+            if isinstance(result.content, str):
+                content = result.content
+            elif isinstance(result.content, list):
+                from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem
+
+                content = []
+                for item in result.content:
+                    if isinstance(item, TextContentItem):
+                        part = OpenAIChatCompletionContentPartTextParam(text=item.text)
+                    elif isinstance(item, ImageContentItem):
+                        # Inline image data is sent as a base64 data URL; otherwise the image's URL is used
+                        if item.image.data:
+                            url = f"data:image;base64,{item.image.data}"
+                        else:
+                            url = item.image.url
+                        part = OpenAIChatCompletionContentPartImageParam(image_url=OpenAIImageURL(url=url))
+                    else:
+                        raise ValueError(f"Unknown result content type: {type(item)}")
+                    content.append(part)
+            else:
+                raise ValueError(f"Unknown result content type: {type(result.content)}")
+            input_message = OpenAIToolMessageParam(content=content, tool_call_id=tool_call_id)
+
+        return message, input_message
diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py
index 202d43609..5031a4a90 100644
--- a/llama_stack/providers/inline/agents/meta_reference/persistence.py
+++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py
@@ -8,11 +8,8 @@ import json
 import logging
 import uuid
 from datetime import datetime, timezone
-from typing import List, Optional
 
-from pydantic import BaseModel
-
-from llama_stack.apis.agents import ToolExecutionStep, Turn
+from llama_stack.apis.agents import AgentConfig, Session, ToolExecutionStep, Turn
 from llama_stack.distribution.access_control import check_access
 from llama_stack.distribution.datatypes import AccessAttributes
 from llama_stack.distribution.request_headers import get_auth_attributes
@@ -21,13 +18,15 @@ from llama_stack.providers.utils.kvstore import KVStore
 log = logging.getLogger(__name__)
 
 
-class AgentSessionInfo(BaseModel):
-    session_id: str
-    session_name: str
+class AgentSessionInfo(Session):
     # TODO: is this used anywhere?
-    vector_db_id: Optional[str] = None
+    vector_db_id: str | None = None
     started_at: datetime
-    access_attributes: Optional[AccessAttributes] = None
+    access_attributes: AccessAttributes | None = None
+
+
+class AgentInfo(AgentConfig):
+    created_at: datetime
 
 
 class AgentPersistence:
@@ -47,6 +46,7 @@ class AgentPersistence:
             session_name=name,
             started_at=datetime.now(timezone.utc),
             access_attributes=access_attributes,
+            turns=[],
         )
 
         await self.kvstore.set(
@@ -55,7 +55,7 @@ class AgentPersistence:
         )
         return session_id
 
-    async def get_session_info(self, session_id: str) -> Optional[AgentSessionInfo]:
+    async def get_session_info(self, session_id: str) -> AgentSessionInfo | None:
         value = await self.kvstore.get(
             key=f"session:{self.agent_id}:{session_id}",
         )
@@ -78,7 +78,7 @@ class AgentPersistence:
 
         return check_access(session_info.session_id, session_info.access_attributes, get_auth_attributes())
 
-    async def get_session_if_accessible(self, session_id: str) -> Optional[AgentSessionInfo]:
+    async def get_session_if_accessible(self, session_id: str) -> AgentSessionInfo | None:
         """Get session info if the user has access to it. For internal use by sub-session methods."""
         session_info = await self.get_session_info(session_id)
         if not session_info:
@@ -106,11 +106,11 @@ class AgentPersistence:
             value=turn.model_dump_json(),
         )
 
-    async def get_session_turns(self, session_id: str) -> List[Turn]:
+    async def get_session_turns(self, session_id: str) -> list[Turn]:
         if not await self.get_session_if_accessible(session_id):
             raise ValueError(f"Session {session_id} not found or access denied")
 
-        values = await self.kvstore.range(
+        values = await self.kvstore.values_in_range(
             start_key=f"session:{self.agent_id}:{session_id}:",
             end_key=f"session:{self.agent_id}:{session_id}:\xff\xff\xff\xff",
         )
@@ -122,10 +122,9 @@ class AgentPersistence:
             except Exception as e:
                 log.error(f"Error parsing turn: {e}")
                 continue
-        turns.sort(key=lambda x: (x.completed_at or datetime.min))
         return turns
 
-    async def get_session_turn(self, session_id: str, turn_id: str) -> Optional[Turn]:
+    async def get_session_turn(self, session_id: str, turn_id: str) -> Turn | None:
         if not await self.get_session_if_accessible(session_id):
             raise ValueError(f"Session {session_id} not found or access denied")
 
@@ -145,7 +144,7 @@ class AgentPersistence:
             value=step.model_dump_json(),
         )
 
-    async def get_in_progress_tool_call_step(self, session_id: str, turn_id: str) -> Optional[ToolExecutionStep]:
+    async def get_in_progress_tool_call_step(self, session_id: str, turn_id: str) -> ToolExecutionStep | None:
         if not await self.get_session_if_accessible(session_id):
             return None
 
@@ -163,7 +162,7 @@ class AgentPersistence:
             value=str(num_infer_iters),
         )
 
-    async def get_num_infer_iters_in_turn(self, session_id: str, turn_id: str) -> Optional[int]:
+    async def get_num_infer_iters_in_turn(self, session_id: str, turn_id: str) -> int | None:
         if not await self.get_session_if_accessible(session_id):
             return None
 
@@ -171,3 +170,43 @@ class AgentPersistence:
             key=f"num_infer_iters_in_turn:{self.agent_id}:{session_id}:{turn_id}",
         )
         return int(value) if value else None
+
+    async def list_sessions(self) -> list[Session]:
+        """List the sessions stored for this agent.
+
+        The scanned key range starts at ``session:{agent_id}:``, which also matches
+        the turn records stored under ``session:{agent_id}:{session_id}:{turn_id}``.
+        Those payloads carry a ``turn_id`` field and are skipped, so they are neither
+        reported as sessions nor logged as parse errors.
+
+        Returns:
+            The sessions that could be parsed; unparsable entries are logged and skipped.
+        """
+        values = await self.kvstore.values_in_range(
+            start_key=f"session:{self.agent_id}:",
+            end_key=f"session:{self.agent_id}:\xff\xff\xff\xff",
+        )
+        sessions = []
+        for value in values:
+            try:
+                data = json.loads(value)
+                # Turn records share the session key prefix but are not sessions
+                if isinstance(data, dict) and "turn_id" in data:
+                    continue
+                sessions.append(Session(**data))
+            except Exception as e:
+                log.error(f"Error parsing session info: {e}")
+                continue
+        return sessions
+
+    async def delete_session_turns(self, session_id: str) -> None:
+        """Remove every stored turn record that belongs to a session.
+
+        The turns are looked up through ``get_session_turns`` and the key of each
+        one is then deleted from the kvstore.
+
+        Args:
+            session_id: The ID of the session whose turn records are removed.
+
+        Raises:
+            ValueError: Propagated from ``get_session_turns`` when the session is
+                not found or access is denied.
+        """
+        stored_turns = await self.get_session_turns(session_id)
+        turn_keys = [f"session:{self.agent_id}:{session_id}:{turn.turn_id}" for turn in stored_turns]
+        for turn_key in turn_keys:
+            await self.kvstore.delete(key=turn_key)
+
+    async def delete_session(self, session_id: str) -> None:
+        """Delete the stored record of a session.
+
+        Only the ``session:{agent_id}:{session_id}`` key is removed here; the
+        session's turn records are not touched by this method (they are removed
+        by ``delete_session_turns``).
+
+        Args:
+            session_id: The ID of the session to delete.
+
+        Raises:
+            ValueError: If the session does not exist.
+        """
+        session_info = await self.get_session_info(session_id)
+        if session_info is None:
+            raise ValueError(f"Session {session_id} not found")
+
+        await self.kvstore.delete(key=f"session:{self.agent_id}:{session_id}")
diff --git a/llama_stack/providers/inline/agents/meta_reference/safety.py b/llama_stack/providers/inline/agents/meta_reference/safety.py
index bef16eaba..6b3573d8c 100644
--- a/llama_stack/providers/inline/agents/meta_reference/safety.py
+++ b/llama_stack/providers/inline/agents/meta_reference/safety.py
@@ -6,7 +6,6 @@
 
 import asyncio
 import logging
-from typing import List
 
 from llama_stack.apis.inference import Message
 from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel
@@ -25,14 +24,14 @@ class ShieldRunnerMixin:
     def __init__(
         self,
         safety_api: Safety,
-        input_shields: List[str] = None,
-        output_shields: List[str] = None,
+        input_shields: list[str] | None = None,
+        output_shields: list[str] | None = None,
     ):
+        # Both shield lists are optional and are stored as given (None when omitted).
         self.safety_api = safety_api
         self.input_shields = input_shields
         self.output_shields = output_shields
 
-    async def run_multiple_shields(self, messages: List[Message], identifiers: List[str]) -> None:
+    async def run_multiple_shields(self, messages: list[Message], identifiers: list[str]) -> None:
         async def run_shield_with_span(identifier: str):
             async with tracing.span(f"run_shield_{identifier}"):
                 return await self.safety_api.run_shield(
diff --git a/llama_stack/providers/inline/datasetio/localfs/__init__.py b/llama_stack/providers/inline/datasetio/localfs/__init__.py
index 5a0876d79..58aa6ffaf 100644
--- a/llama_stack/providers/inline/datasetio/localfs/__init__.py
+++ b/llama_stack/providers/inline/datasetio/localfs/__init__.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from .config import LocalFSDatasetIOConfig
 
 
 async def get_provider_impl(
     config: LocalFSDatasetIOConfig,
-    _deps: Dict[str, Any],
+    _deps: dict[str, Any],
 ):
     from .datasetio import LocalFSDatasetIOImpl
 
diff --git a/llama_stack/providers/inline/datasetio/localfs/config.py b/llama_stack/providers/inline/datasetio/localfs/config.py
index d74521f1f..b450e8777 100644
--- a/llama_stack/providers/inline/datasetio/localfs/config.py
+++ b/llama_stack/providers/inline/datasetio/localfs/config.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -17,7 +17,7 @@ class LocalFSDatasetIOConfig(BaseModel):
     kvstore: KVStoreConfig
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "kvstore": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
index e71107d61..da71ecb17 100644
--- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import pandas
 
@@ -11,9 +11,9 @@ from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
-from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
 from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
 
 from .config import LocalFSDatasetIOConfig
 
@@ -64,7 +64,7 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         # Load existing datasets from kvstore
         start_key = DATASETS_PREFIX
         end_key = f"{DATASETS_PREFIX}\xff"
-        stored_datasets = await self.kvstore.range(start_key, end_key)
+        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
 
         for dataset in stored_datasets:
             dataset = Dataset.model_validate_json(dataset)
@@ -92,8 +92,8 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
     async def iterrows(
         self,
         dataset_id: str,
-        start_index: Optional[int] = None,
-        limit: Optional[int] = None,
+        start_index: int | None = None,
+        limit: int | None = None,
     ) -> PaginatedResponse:
         dataset_def = self.dataset_infos[dataset_id]
         dataset_impl = PandasDataframeDataset(dataset_def)
@@ -102,7 +102,7 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         records = dataset_impl.df.to_dict("records")
         return paginate_records(records, start_index, limit)
 
-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
         dataset_def = self.dataset_infos[dataset_id]
         dataset_impl = PandasDataframeDataset(dataset_def)
         await dataset_impl.load()
diff --git a/llama_stack/providers/inline/eval/meta_reference/__init__.py b/llama_stack/providers/inline/eval/meta_reference/__init__.py
index e2a7fc2cd..7afe7f33b 100644
--- a/llama_stack/providers/inline/eval/meta_reference/__init__.py
+++ b/llama_stack/providers/inline/eval/meta_reference/__init__.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import Api
 
@@ -12,7 +12,7 @@ from .config import MetaReferenceEvalConfig
 
 async def get_provider_impl(
     config: MetaReferenceEvalConfig,
-    deps: Dict[Api, Any],
+    deps: dict[Api, Any],
 ):
     from .eval import MetaReferenceEvalImpl
 
diff --git a/llama_stack/providers/inline/eval/meta_reference/config.py b/llama_stack/providers/inline/eval/meta_reference/config.py
index 5b2bec259..2a4a29998 100644
--- a/llama_stack/providers/inline/eval/meta_reference/config.py
+++ b/llama_stack/providers/inline/eval/meta_reference/config.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -17,7 +17,7 @@ class MetaReferenceEvalConfig(BaseModel):
     kvstore: KVStoreConfig
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "kvstore": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index 7c28f1bb7..bc0898dc5 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
-from typing import Any, Dict, List
+from typing import Any
 
 from tqdm import tqdm
 
@@ -58,7 +58,7 @@ class MetaReferenceEvalImpl(
         # Load existing benchmarks from kvstore
         start_key = EVAL_TASKS_PREFIX
         end_key = f"{EVAL_TASKS_PREFIX}\xff"
-        stored_benchmarks = await self.kvstore.range(start_key, end_key)
+        stored_benchmarks = await self.kvstore.values_in_range(start_key, end_key)
 
         for benchmark in stored_benchmarks:
             benchmark = Benchmark.model_validate_json(benchmark)
@@ -105,8 +105,8 @@ class MetaReferenceEvalImpl(
         return Job(job_id=job_id, status=JobStatus.completed)
 
     async def _run_agent_generation(
-        self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
-    ) -> List[Dict[str, Any]]:
+        self, input_rows: list[dict[str, Any]], benchmark_config: BenchmarkConfig
+    ) -> list[dict[str, Any]]:
         candidate = benchmark_config.eval_candidate
         create_response = await self.agents_api.create_agent(candidate.config)
         agent_id = create_response.agent_id
@@ -148,8 +148,8 @@ class MetaReferenceEvalImpl(
         return generations
 
     async def _run_model_generation(
-        self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
-    ) -> List[Dict[str, Any]]:
+        self, input_rows: list[dict[str, Any]], benchmark_config: BenchmarkConfig
+    ) -> list[dict[str, Any]]:
         candidate = benchmark_config.eval_candidate
         assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
 
@@ -185,8 +185,8 @@ class MetaReferenceEvalImpl(
     async def evaluate_rows(
         self,
         benchmark_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
+        input_rows: list[dict[str, Any]],
+        scoring_functions: list[str],
         benchmark_config: BenchmarkConfig,
     ) -> EvaluateResponse:
         candidate = benchmark_config.eval_candidate
diff --git a/llama_stack/providers/inline/inference/meta_reference/__init__.py b/llama_stack/providers/inline/inference/meta_reference/__init__.py
index 3710766e2..5eb822429 100644
--- a/llama_stack/providers/inline/inference/meta_reference/__init__.py
+++ b/llama_stack/providers/inline/inference/meta_reference/__init__.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from .config import MetaReferenceInferenceConfig
 
 
 async def get_provider_impl(
     config: MetaReferenceInferenceConfig,
-    _deps: Dict[str, Any],
+    _deps: dict[str, Any],
 ):
     from .inference import MetaReferenceInferenceImpl
 
diff --git a/llama_stack/providers/inline/inference/meta_reference/config.py b/llama_stack/providers/inline/inference/meta_reference/config.py
index 6f796d0d4..7bc961443 100644
--- a/llama_stack/providers/inline/inference/meta_reference/config.py
+++ b/llama_stack/providers/inline/inference/meta_reference/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, field_validator
 
@@ -17,11 +17,11 @@ class MetaReferenceInferenceConfig(BaseModel):
     # the actual inference model id is dtermined by the moddel id in the request
     # Note: you need to register the model before using it for inference
     # models in the resouce list in the run.yaml config will be registered automatically
-    model: Optional[str] = None
-    torch_seed: Optional[int] = None
+    model: str | None = None
+    torch_seed: int | None = None
     max_seq_len: int = 4096
     max_batch_size: int = 1
-    model_parallel_size: Optional[int] = None
+    model_parallel_size: int | None = None
 
     # when this is False, we assume that the distributed process group is setup by someone
     # outside of this code (e.g., when run inside `torchrun`). that is useful for clients
@@ -30,9 +30,9 @@ class MetaReferenceInferenceConfig(BaseModel):
 
     # By default, the implementation will look at ~/.llama/checkpoints/ but you
     # can override by specifying the directory explicitly
-    checkpoint_dir: Optional[str] = None
+    checkpoint_dir: str | None = None
 
-    quantization: Optional[QuantizationConfig] = None
+    quantization: QuantizationConfig | None = None
 
     @field_validator("model")
     @classmethod
@@ -55,7 +55,7 @@ class MetaReferenceInferenceConfig(BaseModel):
         max_batch_size: str = "${env.MAX_BATCH_SIZE:1}",
         max_seq_len: str = "${env.MAX_SEQ_LEN:4096}",
         **kwargs,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         return {
             "model": model,
             "checkpoint_dir": checkpoint_dir,
diff --git a/llama_stack/providers/inline/inference/meta_reference/generators.py b/llama_stack/providers/inline/inference/meta_reference/generators.py
index 0a928ce73..cb926f529 100644
--- a/llama_stack/providers/inline/inference/meta_reference/generators.py
+++ b/llama_stack/providers/inline/inference/meta_reference/generators.py
@@ -5,7 +5,8 @@
 # the root directory of this source tree.
 
 import math
-from typing import Generator, List, Optional, Tuple
+from collections.abc import Generator
+from typing import Optional
 
 import torch
 from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
@@ -39,7 +40,7 @@ Tokenizer = Llama4Tokenizer | Llama3Tokenizer
 class LogitsProcessor:
     def __init__(self, token_enforcer: TokenEnforcer):
         self.token_enforcer = token_enforcer
-        self.mask: Optional[torch.Tensor] = None
+        self.mask: torch.Tensor | None = None
 
     def __call__(self, tokens: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
         token_sequence = tokens[0, :].tolist()
@@ -58,7 +59,7 @@ class LogitsProcessor:
 def get_logits_processor(
     tokenizer: Tokenizer,
     vocab_size: int,
-    response_format: Optional[ResponseFormat],
+    response_format: ResponseFormat | None,
 ) -> Optional["LogitsProcessor"]:
     if response_format is None:
         return None
@@ -76,7 +77,7 @@ def get_logits_processor(
     return LogitsProcessor(token_enforcer)
 
 
-def _build_regular_tokens_list(tokenizer: Tokenizer, vocab_size: int) -> List[Tuple[int, str, bool]]:
+def _build_regular_tokens_list(tokenizer: Tokenizer, vocab_size: int) -> list[tuple[int, str, bool]]:
     token_0 = tokenizer.encode("0", bos=False, eos=False)[-1]
     regular_tokens = []
 
@@ -158,7 +159,7 @@ class LlamaGenerator:
 
     def completion(
         self,
-        request_batch: List[CompletionRequestWithRawContent],
+        request_batch: list[CompletionRequestWithRawContent],
     ) -> Generator:
         first_request = request_batch[0]
         sampling_params = first_request.sampling_params or SamplingParams()
@@ -167,7 +168,7 @@ class LlamaGenerator:
             max_gen_len = self.args.max_seq_len - 1
 
         temperature, top_p = _infer_sampling_params(sampling_params)
-        for result in self.inner_generator.generate(
+        yield from self.inner_generator.generate(
             llm_inputs=[self.formatter.encode_content(request.content) for request in request_batch],
             max_gen_len=max_gen_len,
             temperature=temperature,
@@ -179,12 +180,11 @@ class LlamaGenerator:
                 self.args.vocab_size,
                 first_request.response_format,
             ),
-        ):
-            yield result
+        )
 
     def chat_completion(
         self,
-        request_batch: List[ChatCompletionRequestWithRawContent],
+        request_batch: list[ChatCompletionRequestWithRawContent],
     ) -> Generator:
         first_request = request_batch[0]
         sampling_params = first_request.sampling_params or SamplingParams()
@@ -193,7 +193,7 @@ class LlamaGenerator:
             max_gen_len = self.args.max_seq_len - 1
 
         temperature, top_p = _infer_sampling_params(sampling_params)
-        for result in self.inner_generator.generate(
+        yield from self.inner_generator.generate(
             llm_inputs=[
                 self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))
                 for request in request_batch
@@ -208,5 +208,4 @@ class LlamaGenerator:
                 self.args.vocab_size,
                 first_request.response_format,
             ),
-        ):
-            yield result
+        )
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 1bc098fab..e238e1b78 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -6,7 +6,8 @@
 
 import asyncio
 import os
-from typing import AsyncGenerator, List, Optional, Union
+import sys
+from collections.abc import AsyncGenerator
 
 from pydantic import BaseModel
 from termcolor import cprint
@@ -28,7 +29,7 @@ from llama_stack.apis.inference import (
     CompletionRequest,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    Inference,
+    InferenceProvider,
     InterleavedContent,
     LogProbConfig,
     Message,
@@ -86,7 +87,7 @@ class MetaReferenceInferenceImpl(
     OpenAICompletionToLlamaStackMixin,
     OpenAIChatCompletionToLlamaStackMixin,
     SentenceTransformerEmbeddingMixin,
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, config: MetaReferenceInferenceConfig) -> None:
@@ -184,11 +185,11 @@ class MetaReferenceInferenceImpl(
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+    ) -> CompletionResponse | CompletionResponseStreamChunk:
         if sampling_params is None:
             sampling_params = SamplingParams()
         if logprobs:
@@ -215,11 +216,11 @@ class MetaReferenceInferenceImpl(
     async def batch_completion(
         self,
         model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> BatchCompletionResponse:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -291,14 +292,14 @@ class MetaReferenceInferenceImpl(
             for x in impl():
                 yield x
 
-    async def _nonstream_completion(self, request_batch: List[CompletionRequest]) -> List[CompletionResponse]:
+    async def _nonstream_completion(self, request_batch: list[CompletionRequest]) -> list[CompletionResponse]:
         tokenizer = self.generator.formatter.tokenizer
 
         first_request = request_batch[0]
 
         class ItemState(BaseModel):
-            tokens: List[int] = []
-            logprobs: List[TokenLogProbs] = []
+            tokens: list[int] = []
+            logprobs: list[TokenLogProbs] = []
             stop_reason: StopReason | None = None
             finished: bool = False
 
@@ -349,15 +350,15 @@ class MetaReferenceInferenceImpl(
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -395,13 +396,13 @@ class MetaReferenceInferenceImpl(
     async def batch_chat_completion(
         self,
         model_id: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> BatchChatCompletionResponse:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -436,15 +437,15 @@ class MetaReferenceInferenceImpl(
         return BatchChatCompletionResponse(batch=results)
 
     async def _nonstream_chat_completion(
-        self, request_batch: List[ChatCompletionRequest]
-    ) -> List[ChatCompletionResponse]:
+        self, request_batch: list[ChatCompletionRequest]
+    ) -> list[ChatCompletionResponse]:
         tokenizer = self.generator.formatter.tokenizer
 
         first_request = request_batch[0]
 
         class ItemState(BaseModel):
-            tokens: List[int] = []
-            logprobs: List[TokenLogProbs] = []
+            tokens: list[int] = []
+            logprobs: list[TokenLogProbs] = []
             stop_reason: StopReason | None = None
             finished: bool = False
 
@@ -455,9 +456,9 @@ class MetaReferenceInferenceImpl(
                 first = token_results[0]
                 if not first.finished and not first.ignore_token:
                     if os.environ.get("LLAMA_MODELS_DEBUG", "0") in ("1", "2"):
-                        cprint(first.text, "cyan", end="")
+                        cprint(first.text, color="cyan", end="", file=sys.stderr)
                     if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "2":
-                        cprint(f"<{first.token}>", "magenta", end="")
+                        cprint(f"<{first.token}>", color="magenta", end="", file=sys.stderr)
 
                 for result in token_results:
                     idx = result.batch_idx
@@ -519,9 +520,9 @@ class MetaReferenceInferenceImpl(
             for token_results in self.generator.chat_completion([request]):
                 token_result = token_results[0]
                 if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
-                    cprint(token_result.text, "cyan", end="")
+                    cprint(token_result.text, color="cyan", end="", file=sys.stderr)
                 if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "2":
-                    cprint(f"<{token_result.token}>", "magenta", end="")
+                    cprint(f"<{token_result.token}>", color="magenta", end="", file=sys.stderr)
 
                 if token_result.token == tokenizer.eot_id:
                     stop_reason = StopReason.end_of_turn
diff --git a/llama_stack/providers/inline/inference/meta_reference/model_parallel.py b/llama_stack/providers/inline/inference/meta_reference/model_parallel.py
index 50640c6d1..9031d36b3 100644
--- a/llama_stack/providers/inline/inference/meta_reference/model_parallel.py
+++ b/llama_stack/providers/inline/inference/meta_reference/model_parallel.py
@@ -4,9 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from collections.abc import Callable, Generator
 from copy import deepcopy
 from functools import partial
-from typing import Any, Callable, Generator, List
+from typing import Any
 
 from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
 from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
@@ -82,7 +83,7 @@ class LlamaModelParallelGenerator:
 
     def completion(
         self,
-        request_batch: List[CompletionRequestWithRawContent],
+        request_batch: list[CompletionRequestWithRawContent],
     ) -> Generator:
         req_obj = deepcopy(request_batch)
         gen = self.group.run_inference(("completion", req_obj))
@@ -90,7 +91,7 @@ class LlamaModelParallelGenerator:
 
     def chat_completion(
         self,
-        request_batch: List[ChatCompletionRequestWithRawContent],
+        request_batch: list[ChatCompletionRequestWithRawContent],
     ) -> Generator:
         req_obj = deepcopy(request_batch)
         gen = self.group.run_inference(("chat_completion", req_obj))
diff --git a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
index 8c0ffc632..97e96b929 100644
--- a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
+++ b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
@@ -18,8 +18,9 @@ import os
 import tempfile
 import time
 import uuid
+from collections.abc import Callable, Generator
 from enum import Enum
-from typing import Callable, Generator, List, Literal, Optional, Tuple, Union
+from typing import Annotated, Literal
 
 import torch
 import zmq
@@ -30,7 +31,6 @@ from fairscale.nn.model_parallel.initialize import (
 )
 from pydantic import BaseModel, Field
 from torch.distributed.launcher.api import LaunchConfig, elastic_launch
-from typing_extensions import Annotated
 
 from llama_stack.models.llama.datatypes import GenerationResult
 from llama_stack.providers.utils.inference.prompt_adapter import (
@@ -69,15 +69,15 @@ class CancelSentinel(BaseModel):
 
 class TaskRequest(BaseModel):
     type: Literal[ProcessingMessageName.task_request] = ProcessingMessageName.task_request
-    task: Tuple[
+    task: tuple[
         str,
-        List[CompletionRequestWithRawContent] | List[ChatCompletionRequestWithRawContent],
+        list[CompletionRequestWithRawContent] | list[ChatCompletionRequestWithRawContent],
     ]
 
 
 class TaskResponse(BaseModel):
     type: Literal[ProcessingMessageName.task_response] = ProcessingMessageName.task_response
-    result: List[GenerationResult]
+    result: list[GenerationResult]
 
 
 class ExceptionResponse(BaseModel):
@@ -85,15 +85,9 @@ class ExceptionResponse(BaseModel):
     error: str
 
 
-ProcessingMessage = Union[
-    ReadyRequest,
-    ReadyResponse,
-    EndSentinel,
-    CancelSentinel,
-    TaskRequest,
-    TaskResponse,
-    ExceptionResponse,
-]
+ProcessingMessage = (
+    ReadyRequest | ReadyResponse | EndSentinel | CancelSentinel | TaskRequest | TaskResponse | ExceptionResponse
+)
 
 
 class ProcessingMessageWrapper(BaseModel):
@@ -203,7 +197,7 @@ def maybe_get_work(sock: zmq.Socket):
     return client_id, message
 
 
-def maybe_parse_message(maybe_json: Optional[str]) -> Optional[ProcessingMessage]:
+def maybe_parse_message(maybe_json: str | None) -> ProcessingMessage | None:
     if maybe_json is None:
         return None
     try:
@@ -334,9 +328,9 @@ class ModelParallelProcessGroup:
 
     def run_inference(
         self,
-        req: Tuple[
+        req: tuple[
             str,
-            List[CompletionRequestWithRawContent] | List[ChatCompletionRequestWithRawContent],
+            list[CompletionRequestWithRawContent] | list[ChatCompletionRequestWithRawContent],
         ],
     ) -> Generator:
         assert not self.running, "inference already running"
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/__init__.py b/llama_stack/providers/inline/inference/sentence_transformers/__init__.py
index c1d65d10c..1719cbacc 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/__init__.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/__init__.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.providers.inline.inference.sentence_transformers.config import (
     SentenceTransformersInferenceConfig,
@@ -13,7 +13,7 @@ from llama_stack.providers.inline.inference.sentence_transformers.config import
 
 async def get_provider_impl(
     config: SentenceTransformersInferenceConfig,
-    _deps: Dict[str, Any],
+    _deps: dict[str, Any],
 ):
     from .sentence_transformers import SentenceTransformersInferenceImpl
 
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/config.py b/llama_stack/providers/inline/inference/sentence_transformers/config.py
index 93e0afe11..b03010b10 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/config.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/config.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
 
 class SentenceTransformersInferenceConfig(BaseModel):
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {}
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index d717d055f..890c526f5 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -5,11 +5,11 @@
 # the root directory of this source tree.
 
 import logging
-from typing import AsyncGenerator, List, Optional, Union
+from collections.abc import AsyncGenerator
 
 from llama_stack.apis.inference import (
     CompletionResponse,
-    Inference,
+    InferenceProvider,
     InterleavedContent,
     LogProbConfig,
     Message,
@@ -38,7 +38,7 @@ class SentenceTransformersInferenceImpl(
     OpenAIChatCompletionToLlamaStackMixin,
     OpenAICompletionToLlamaStackMixin,
     SentenceTransformerEmbeddingMixin,
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
@@ -60,46 +60,46 @@ class SentenceTransformersInferenceImpl(
         self,
         model_id: str,
         content: str,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, AsyncGenerator]:
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+    ) -> CompletionResponse | AsyncGenerator:
         raise ValueError("Sentence transformers don't support completion")
 
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         raise ValueError("Sentence transformers don't support chat completion")
 
     async def batch_completion(
         self,
         model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch completion is not supported for Sentence Transformers")
 
     async def batch_chat_completion(
         self,
         model_id: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_config: Optional[ToolConfig] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_config: ToolConfig | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers")
diff --git a/llama_stack/providers/inline/inference/vllm/__init__.py b/llama_stack/providers/inline/inference/vllm/__init__.py
index bd0551e57..d0ec3e084 100644
--- a/llama_stack/providers/inline/inference/vllm/__init__.py
+++ b/llama_stack/providers/inline/inference/vllm/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from .config import VLLMConfig
 
 
-async def get_provider_impl(config: VLLMConfig, _deps: Dict[str, Any]):
+async def get_provider_impl(config: VLLMConfig, _deps: dict[str, Any]):
     from .vllm import VLLMInferenceImpl
 
     impl = VLLMInferenceImpl(config)
diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py
index 51d48e6d5..ce8743c74 100644
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -42,7 +42,7 @@ class VLLMConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
         return {
             "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
             "max_tokens": "${env.MAX_TOKENS:4096}",
diff --git a/llama_stack/providers/inline/inference/vllm/openai_utils.py b/llama_stack/providers/inline/inference/vllm/openai_utils.py
index d34f5ad5f..77cbf0403 100644
--- a/llama_stack/providers/inline/inference/vllm/openai_utils.py
+++ b/llama_stack/providers/inline/inference/vllm/openai_utils.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List, Optional
 
 import vllm
 
@@ -55,8 +54,8 @@ def _merge_context_into_content(message: Message) -> Message:  # type: ignore
 
 
 def _llama_stack_tools_to_openai_tools(
-    tools: Optional[List[ToolDefinition]] = None,
-) -> List[vllm.entrypoints.openai.protocol.ChatCompletionToolsParam]:
+    tools: list[ToolDefinition] | None = None,
+) -> list[vllm.entrypoints.openai.protocol.ChatCompletionToolsParam]:
     """
     Convert the list of available tools from Llama Stack's format to vLLM's
     version of OpenAI's format.
diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py
index 9d742c39c..bf54462b5 100644
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@@ -7,7 +7,7 @@
 import json
 import re
 import uuid
-from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
 
 # These vLLM modules contain names that overlap with Llama Stack names, so we import
 # fully-qualified names
@@ -40,6 +40,7 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -100,7 +101,7 @@ def _random_uuid_str() -> str:
 
 
 def _response_format_to_guided_decoding_params(
-    response_format: Optional[ResponseFormat],  # type: ignore
+    response_format: ResponseFormat | None,  # type: ignore
 ) -> vllm.sampling_params.GuidedDecodingParams:
     """
     Translate constrained decoding parameters from Llama Stack's format to vLLM's format.
@@ -131,9 +132,9 @@ def _response_format_to_guided_decoding_params(
 
 
 def _convert_sampling_params(
-    sampling_params: Optional[SamplingParams],
-    response_format: Optional[ResponseFormat],  # type: ignore
-    log_prob_config: Optional[LogProbConfig],
+    sampling_params: SamplingParams | None,
+    response_format: ResponseFormat | None,  # type: ignore
+    log_prob_config: LogProbConfig | None,
 ) -> vllm.SamplingParams:
     """Convert sampling and constrained decoding configuration from Llama Stack's format to vLLM's
     format."""
@@ -370,11 +371,11 @@ class VLLMInferenceImpl(
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+    ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
         if model_id not in self.model_ids:
             raise ValueError(
                 f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
@@ -403,25 +404,35 @@ class VLLMInferenceImpl(
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         raise NotImplementedError()
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
+
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],  # type: ignore
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,  # type: ignore
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],  # type: ignore
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,  # type: ignore
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
         sampling_params = sampling_params or SamplingParams()
         if model_id not in self.model_ids:
@@ -605,7 +616,7 @@ class VLLMInferenceImpl(
 
     async def _chat_completion_for_meta_llama(
         self, request: ChatCompletionRequest
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
         """
         Subroutine that routes chat completions for Meta Llama models through Llama Stack's
         chat template instead of using vLLM's version of that template. The Llama Stack version
@@ -701,7 +712,7 @@ class VLLMInferenceImpl(
         # Tool calls come in pieces, but Llama Stack expects them in bigger chunks. We build up
         # those chunks and output them at the end.
         # This data structure holds the current set of partial tool calls.
-        index_to_tool_call: Dict[int, Dict] = dict()
+        index_to_tool_call: dict[int, dict] = dict()
 
         # The Llama Stack event stream must always start with a start event. Use an empty one to
         # simplify logic below
diff --git a/llama_stack/providers/inline/post_training/common/utils.py b/llama_stack/providers/inline/post_training/common/utils.py
new file mode 100644
index 000000000..7840b21e8
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/common/utils.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import gc
+
+
+def evacuate_model_from_device(model, device: str):
+    """Safely clear a model from memory and free device resources.
+    This function handles the cleanup of a model by:
+    1. Moving the model to CPU if it's on a non-CPU device
+    2. Dropping this function's reference to the model (callers must drop theirs too)
+    3. Running garbage collection
+    4. Clearing CUDA cache if the model was on a CUDA device
+    Args:
+        model: The PyTorch model to clear
+        device: The device the model is currently on ('cuda', 'cuda:0', 'mps', 'cpu')
+    Note:
+        - For CUDA devices, this will clear the CUDA cache after moving the model to CPU
+        - For MPS devices, only moves the model to CPU (no cache clearing is done here)
+        - For CPU devices, only drops the local reference and runs garbage collection
+    """
+    if device != "cpu":
+        model.to("cpu")
+
+    del model
+    gc.collect()
+
+    if device.startswith("cuda"):  # also matches indexed devices such as "cuda:0"
+        # torch is imported lazily so it is only loaded when this branch actually runs
+        import torch
+
+        torch.cuda.empty_cache()
diff --git a/llama_stack/providers/inline/post_training/common/validator.py b/llama_stack/providers/inline/post_training/common/validator.py
index b0aec6187..950b75f86 100644
--- a/llama_stack/providers/inline/post_training/common/validator.py
+++ b/llama_stack/providers/inline/post_training/common/validator.py
@@ -17,10 +17,8 @@ from llama_stack.apis.common.type_system import (
     DialogType,
     StringType,
 )
-from llama_stack.apis.datasets import Datasets
 from llama_stack.providers.utils.common.data_schema_validator import (
     ColumnName,
-    validate_dataset_schema,
 )
 
 EXPECTED_DATASET_SCHEMA: dict[str, list[dict[str, Any]]] = {
@@ -36,21 +34,3 @@ EXPECTED_DATASET_SCHEMA: dict[str, list[dict[str, Any]]] = {
         }
     ],
 }
-
-
-async def validate_input_dataset_schema(
-    datasets_api: Datasets,
-    dataset_id: str,
-    dataset_type: str,
-) -> None:
-    dataset_def = await datasets_api.get_dataset(dataset_id=dataset_id)
-    if not dataset_def:
-        raise ValueError(f"Dataset {dataset_id} does not exist.")
-
-    if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
-        raise ValueError(f"Dataset {dataset_id} does not have a schema defined.")
-
-    if dataset_type not in EXPECTED_DATASET_SCHEMA:
-        raise ValueError(f"Dataset type {dataset_type} is not supported.")
-
-    validate_dataset_schema(dataset_def.dataset_schema, EXPECTED_DATASET_SCHEMA[dataset_type])
diff --git a/llama_stack/providers/inline/post_training/huggingface/__init__.py b/llama_stack/providers/inline/post_training/huggingface/__init__.py
new file mode 100644
index 000000000..cc1a671c1
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.distribution.datatypes import Api
+
+from .config import HuggingFacePostTrainingConfig
+
+# post_training api and the huggingface provider is still experimental and under heavy development
+
+
+async def get_provider_impl(
+    config: HuggingFacePostTrainingConfig,
+    deps: dict[Api, Any],
+):
+    from .post_training import HuggingFacePostTrainingImpl
+
+    impl = HuggingFacePostTrainingImpl(
+        config,
+        deps[Api.datasetio],
+        deps[Api.datasets],
+    )
+    return impl
diff --git a/llama_stack/providers/inline/post_training/huggingface/config.py b/llama_stack/providers/inline/post_training/huggingface/config.py
new file mode 100644
index 000000000..06c6d8073
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface/config.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Literal
+
+from pydantic import BaseModel
+
+
+class HuggingFacePostTrainingConfig(BaseModel):
+    # Device to run training on (cuda, cpu, mps)
+    device: str = "cuda"
+
+    # Distributed training backend if using multiple devices
+    # fsdp: Fully Sharded Data Parallel
+    # deepspeed: DeepSpeed ZeRO optimization
+    distributed_backend: Literal["fsdp", "deepspeed"] | None = None
+
+    # Format for saving model checkpoints
+    # full_state: Save complete model state
+    # huggingface: Save in HuggingFace format (recommended for compatibility)
+    checkpoint_format: Literal["full_state", "huggingface"] | None = "huggingface"
+
+    # Template for formatting chat inputs and outputs
+    # Used to structure the conversation format for training
+    chat_template: str = "<|user|>\n{input}\n<|assistant|>\n{output}"
+
+    # Model-specific configuration parameters
+    # trust_remote_code: Allow execution of custom model code
+    # attn_implementation: Use SDPA (Scaled Dot Product Attention) for better performance
+    model_specific_config: dict = {
+        "trust_remote_code": True,
+        "attn_implementation": "sdpa",
+    }
+
+    # Maximum sequence length for training
+    # Set to 2048 as this is the maximum that works reliably on MPS (Apple Silicon)
+    # Longer sequences may cause memory issues on MPS devices
+    max_seq_length: int = 2048
+
+    # Enable gradient checkpointing to reduce memory usage
+    # Trades computation for memory by recomputing activations
+    gradient_checkpointing: bool = False
+
+    # Maximum number of checkpoints to keep
+    # Older checkpoints are deleted when this limit is reached
+    save_total_limit: int = 3
+
+    # Number of training steps between logging updates
+    logging_steps: int = 10
+
+    # Ratio of training steps used for learning rate warmup
+    # Helps stabilize early training
+    warmup_ratio: float = 0.1
+
+    # L2 regularization coefficient
+    # Helps prevent overfitting
+    weight_decay: float = 0.01
+
+    # Number of worker processes for data loading
+    # Higher values can improve data loading speed but increase memory usage
+    dataloader_num_workers: int = 4
+
+    # Whether to pin memory in data loader
+    # Can improve data transfer speed to GPU but uses more memory
+    dataloader_pin_memory: bool = True
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
+        return {"checkpoint_format": "huggingface", "distributed_backend": None, "device": "cpu"}
diff --git a/llama_stack/providers/inline/post_training/huggingface/post_training.py b/llama_stack/providers/inline/post_training/huggingface/post_training.py
new file mode 100644
index 000000000..0b2760792
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface/post_training.py
@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from enum import Enum
+from typing import Any
+
+from llama_stack.apis.datasetio import DatasetIO
+from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.post_training import (
+    AlgorithmConfig,
+    Checkpoint,
+    DPOAlignmentConfig,
+    JobStatus,
+    ListPostTrainingJobsResponse,
+    PostTrainingJob,
+    PostTrainingJobArtifactsResponse,
+    PostTrainingJobStatusResponse,
+    TrainingConfig,
+)
+from llama_stack.providers.inline.post_training.huggingface.config import (
+    HuggingFacePostTrainingConfig,
+)
+from llama_stack.providers.inline.post_training.huggingface.recipes.finetune_single_device import (
+    HFFinetuningSingleDevice,
+)
+from llama_stack.providers.utils.scheduler import JobArtifact, Scheduler
+from llama_stack.providers.utils.scheduler import JobStatus as SchedulerJobStatus
+from llama_stack.schema_utils import webmethod
+
+
+class TrainingArtifactType(Enum):
+    CHECKPOINT = "checkpoint"
+    RESOURCES_STATS = "resources_stats"
+
+
+_JOB_TYPE_SUPERVISED_FINE_TUNE = "supervised-fine-tune"
+
+
+class HuggingFacePostTrainingImpl:
+    def __init__(
+        self,
+        config: HuggingFacePostTrainingConfig,
+        datasetio_api: DatasetIO,
+        datasets: Datasets,
+    ) -> None:
+        self.config = config
+        self.datasetio_api = datasetio_api
+        self.datasets_api = datasets
+        self._scheduler = Scheduler()
+
+    async def shutdown(self) -> None:
+        await self._scheduler.shutdown()
+
+    @staticmethod
+    def _checkpoint_to_artifact(checkpoint: Checkpoint) -> JobArtifact:
+        return JobArtifact(
+            type=TrainingArtifactType.CHECKPOINT.value,
+            name=checkpoint.identifier,
+            uri=checkpoint.path,
+            metadata=dict(checkpoint),
+        )
+
+    @staticmethod
+    def _resources_stats_to_artifact(resources_stats: dict[str, Any]) -> JobArtifact:
+        return JobArtifact(
+            type=TrainingArtifactType.RESOURCES_STATS.value,
+            name=TrainingArtifactType.RESOURCES_STATS.value,
+            metadata=resources_stats,
+        )
+
+    async def supervised_fine_tune(
+        self,
+        job_uuid: str,
+        training_config: TrainingConfig,
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
+        model: str,
+        checkpoint_dir: str | None = None,
+        algorithm_config: AlgorithmConfig | None = None,
+    ) -> PostTrainingJob:
+        async def handler(on_log_message_cb, on_status_change_cb, on_artifact_collected_cb):
+            on_log_message_cb("Starting HF finetuning")
+
+            recipe = HFFinetuningSingleDevice(
+                job_uuid=job_uuid,
+                datasetio_api=self.datasetio_api,
+                datasets_api=self.datasets_api,
+            )
+
+            resources_allocated, checkpoints = await recipe.train(
+                model=model,
+                output_dir=checkpoint_dir,
+                job_uuid=job_uuid,
+                lora_config=algorithm_config,
+                config=training_config,
+                provider_config=self.config,
+            )
+
+            on_artifact_collected_cb(self._resources_stats_to_artifact(resources_allocated))
+            if checkpoints:
+                for checkpoint in checkpoints:
+                    artifact = self._checkpoint_to_artifact(checkpoint)
+                    on_artifact_collected_cb(artifact)
+
+            on_status_change_cb(SchedulerJobStatus.completed)
+            on_log_message_cb("HF finetuning completed")
+
+        job_uuid = self._scheduler.schedule(_JOB_TYPE_SUPERVISED_FINE_TUNE, job_uuid, handler)
+        return PostTrainingJob(job_uuid=job_uuid)
+
+    async def preference_optimize(
+        self,
+        job_uuid: str,
+        finetuned_model: str,
+        algorithm_config: DPOAlignmentConfig,
+        training_config: TrainingConfig,
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
+    ) -> PostTrainingJob:
+        raise NotImplementedError("DPO alignment is not implemented yet")
+
+    async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
+        return ListPostTrainingJobsResponse(
+            data=[PostTrainingJob(job_uuid=job.id) for job in self._scheduler.get_jobs()]
+        )
+
+    @staticmethod
+    def _get_artifacts_metadata_by_type(job, artifact_type):
+        return [artifact.metadata for artifact in job.artifacts if artifact.type == artifact_type]
+
+    @classmethod
+    def _get_checkpoints(cls, job):
+        return cls._get_artifacts_metadata_by_type(job, TrainingArtifactType.CHECKPOINT.value)
+
+    @classmethod
+    def _get_resources_allocated(cls, job):
+        data = cls._get_artifacts_metadata_by_type(job, TrainingArtifactType.RESOURCES_STATS.value)
+        return data[0] if data else None
+
+    @webmethod(route="/post-training/job/status")
+    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse | None:
+        job = self._scheduler.get_job(job_uuid)
+
+        match job.status:
+            # TODO: Add support for other statuses to API
+            case SchedulerJobStatus.new | SchedulerJobStatus.scheduled:
+                status = JobStatus.scheduled
+            case SchedulerJobStatus.running:
+                status = JobStatus.in_progress
+            case SchedulerJobStatus.completed:
+                status = JobStatus.completed
+            case SchedulerJobStatus.failed:
+                status = JobStatus.failed
+            case _:
+                raise NotImplementedError()
+
+        return PostTrainingJobStatusResponse(
+            job_uuid=job_uuid,
+            status=status,
+            scheduled_at=job.scheduled_at,
+            started_at=job.started_at,
+            completed_at=job.completed_at,
+            checkpoints=self._get_checkpoints(job),
+            resources_allocated=self._get_resources_allocated(job),
+        )
+
+    @webmethod(route="/post-training/job/cancel")
+    async def cancel_training_job(self, job_uuid: str) -> None:
+        self._scheduler.cancel(job_uuid)
+
+    @webmethod(route="/post-training/job/artifacts")
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse | None:
+        job = self._scheduler.get_job(job_uuid)
+        return PostTrainingJobArtifactsResponse(job_uuid=job_uuid, checkpoints=self._get_checkpoints(job))
diff --git a/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py b/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py
new file mode 100644
index 000000000..b6d13b029
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py
@@ -0,0 +1,683 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import gc
+import json
+import logging
+import multiprocessing
+import os
+import signal
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import psutil
+
+from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device
+
+# Set tokenizer parallelism environment variable
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# Force PyTorch to use OpenBLAS instead of MKL
+os.environ["MKL_THREADING_LAYER"] = "GNU"
+os.environ["MKL_SERVICE_FORCE_INTEL"] = "0"
+os.environ["MKL_NUM_THREADS"] = "1"
+
+import torch
+from datasets import Dataset
+from peft import LoraConfig
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
+from trl import SFTConfig, SFTTrainer
+
+from llama_stack.apis.datasetio import DatasetIO
+from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.post_training import (
+    Checkpoint,
+    DataConfig,
+    LoraFinetuningConfig,
+    TrainingConfig,
+)
+
+from ..config import HuggingFacePostTrainingConfig
+
+logger = logging.getLogger(__name__)
+
+
+def get_gb(to_convert: int) -> str:
+    """Converts memory stats to GB and formats to 2 decimal places.
+    Args:
+        to_convert: Memory value in bytes
+    Returns:
+        str: Memory value in GB formatted to 2 decimal places
+    """
+    return f"{(to_convert / (1024**3)):.2f}"
+
+
+def get_memory_stats(device: torch.device) -> dict[str, Any]:
+    """Get system-wide memory statistics plus device-specific ones for the given device."""
+    vm = psutil.virtual_memory()  # one snapshot, so the reported fields are mutually consistent
+    stats = {
+        "system_memory": {
+            "total": get_gb(vm.total),
+            "available": get_gb(vm.available),
+            "used": get_gb(vm.used),
+            "percent": vm.percent,
+        }
+    }
+
+    if device.type == "cuda":
+        stats["device_memory"] = {
+            "allocated": get_gb(torch.cuda.memory_allocated(device)),
+            "reserved": get_gb(torch.cuda.memory_reserved(device)),
+            "max_allocated": get_gb(torch.cuda.max_memory_allocated(device)),
+        }
+    elif device.type == "mps":  # no direct MPS memory stats, so report system memory instead
+        stats["device_memory"] = {
+            "note": "MPS memory stats not directly available",
+            "system_memory_used": get_gb(vm.used),
+        }
+    elif device.type == "cpu":  # on CPU, report this process's own memory usage
+        process = psutil.Process()
+        mem = process.memory_info()
+        stats["device_memory"] = {
+            "process_rss": get_gb(mem.rss),
+            "process_vms": get_gb(mem.vms),
+            "process_percent": process.memory_percent(),
+        }
+
+    return stats
+
+
+def setup_torch_device(device_str: str) -> torch.device:
+    """Initialize and validate a PyTorch device.
+    This function handles device initialization and validation for different device types:
+    - CUDA: Validates CUDA availability and handles device selection
+    - MPS: Validates MPS availability for Apple Silicon
+    - CPU: Basic validation
+    - HPU: Raises error as it's not supported
+    Args:
+        device_str: String specifying the device ('cuda', 'cpu', 'mps')
+    Returns:
+        torch.device: The initialized and validated device
+    Raises:
+        RuntimeError: If device initialization fails or device is not supported
+    """
+    try:
+        device = torch.device(device_str)
+    except RuntimeError as e:
+        raise RuntimeError(f"Error getting Torch Device {str(e)}") from e
+
+    # Validate device capabilities
+    if device.type == "cuda":
+        if not torch.cuda.is_available():
+            raise RuntimeError(
+                f"{device.type}: Torch has no CUDA/ROCm support or could not detect a compatible device."
+            )
+        if device.index is None:
+            device = torch.device(device.type, torch.cuda.current_device())
+    elif device.type == "mps":
+        if not torch.backends.mps.is_available():
+            raise RuntimeError(f"{device.type}: Torch has no MPS support or could not detect a compatible device.")
+    elif device.type == "hpu":
+        raise RuntimeError(f"{device.type}: training does not support Intel Gaudi.")
+
+    return device
+
+
+class HFFinetuningSingleDevice:
+    def __init__(
+        self,
+        job_uuid: str,
+        datasetio_api: DatasetIO,
+        datasets_api: Datasets,
+    ):
+        self.datasetio_api = datasetio_api
+        self.datasets_api = datasets_api
+        self.job_uuid = job_uuid
+
+    def validate_dataset_format(self, rows: list[dict]) -> bool:
+        """Return True when every row carries all of the fields required for training."""
+        required = {"input_query", "expected_answer", "chat_completion_input"}
+        return all(required <= row.keys() for row in rows)
+
+    def _process_instruct_format(self, row: dict) -> tuple[str | None, str | None]:
+        """Return (message content, expected answer) for an instruct row, else (None, None)."""
+        if "chat_completion_input" not in row or "expected_answer" not in row:
+            return None, None
+        try:
+            messages = json.loads(row["chat_completion_input"])
+        except json.JSONDecodeError:
+            logger.warning(f"Failed to parse chat_completion_input: {row['chat_completion_input']}")
+            return None, None
+        if not isinstance(messages, list) or len(messages) != 1:
+            logger.warning(f"Invalid chat_completion_input format: {row['chat_completion_input']}")
+            return None, None
+        if "content" not in messages[0]:
+            logger.warning(f"Message missing content: {messages[0]}")
+            return None, None
+        return messages[0]["content"], row["expected_answer"]
+
+    def _process_dialog_format(self, row: dict) -> tuple[str | None, str | None]:
+        """Process a row in dialog format.
+        Returns the first user message and the first assistant reply after it, or
+        (None, None) when the row is unusable. Malformed dialogs are logged and skipped
+        rather than raising, so that one bad row does not abort dataset creation.
+        """
+        if "dialog" not in row:
+            return None, None
+
+        try:
+            dialog = json.loads(row["dialog"])
+        except json.JSONDecodeError:
+            logger.warning(f"Failed to parse dialog: {row['dialog']}")
+            return None, None
+        if not isinstance(dialog, list) or len(dialog) < 2:
+            logger.warning(f"Dialog must have at least 2 messages: {row['dialog']}")
+            return None, None
+
+        # Keep only well-formed messages; entries lacking role/content cannot be trained on.
+        turns = [m for m in dialog if isinstance(m, dict) and "role" in m and "content" in m]
+        if not turns or turns[0]["role"] != "user":
+            logger.warning(f"First message must be from user: {dialog[0]}")
+            return None, None
+
+        # Pick the reply by role, not by position: system or repeated user turns may sit between.
+        reply = next((m["content"] for m in turns[1:] if m["role"] == "assistant"), None)
+        if reply is None:
+            logger.warning("Dialog must have at least one assistant message")
+            return None, None
+        return turns[0]["content"], reply
+
+    def _process_fallback_format(self, row: dict) -> tuple[str | None, str | None]:
+        """Extract an (input, output) pair from a row using the known fallback key pairs.
+        Key pairs are tried in order; (None, None) is returned when none is fully present.
+        """
+        key_pairs = (("input", "output"), ("prompt", "completion"), ("question", "answer"))
+        for source_key, target_key in key_pairs:
+            if source_key in row and target_key in row:
+                return row[source_key], row[target_key]
+        return None, None
+
+    def _format_text(self, input_text: str, output_text: str, provider_config: HuggingFacePostTrainingConfig) -> str:
+        """Format input and output text based on model requirements."""
+        if hasattr(provider_config, "chat_template"):
+            return provider_config.chat_template.format(input=input_text, output=output_text)
+        return f"{input_text}\n{output_text}"
+
+    def _create_dataset(
+        self, rows: list[dict], config: TrainingConfig, provider_config: HuggingFacePostTrainingConfig
+    ) -> Dataset:
+        """Create and preprocess the dataset."""
+        formatted_rows = []
+        for row in rows:
+            input_text = None
+            output_text = None
+
+            # Process based on format
+            assert isinstance(config.data_config, DataConfig), "DataConfig must be initialized"
+            if config.data_config.data_format.value == "instruct":
+                input_text, output_text = self._process_instruct_format(row)
+            elif config.data_config.data_format.value == "dialog":
+                input_text, output_text = self._process_dialog_format(row)
+            else:
+                input_text, output_text = self._process_fallback_format(row)
+
+            if input_text and output_text:
+                formatted_text = self._format_text(input_text, output_text, provider_config)
+                formatted_rows.append({"text": formatted_text})
+
+        if not formatted_rows:
+            assert isinstance(config.data_config, DataConfig), "DataConfig must be initialized"
+            raise ValueError(
+                f"No valid input/output pairs found in the dataset for format: {config.data_config.data_format.value}"
+            )
+
+        return Dataset.from_list(formatted_rows)
+
+    def _preprocess_dataset(
+        self, ds: Dataset, tokenizer: AutoTokenizer, provider_config: HuggingFacePostTrainingConfig
+    ) -> Dataset:
+        """Preprocess the dataset with tokenizer."""
+
+        def tokenize_function(examples):
+            return tokenizer(
+                examples["text"],
+                padding=True,
+                truncation=True,
+                max_length=provider_config.max_seq_length,
+                return_tensors=None,
+            )
+
+        return ds.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=ds.column_names,
+        )
+
+    async def _setup_data(self, dataset_id: str) -> list[dict[str, Any]]:
+        """Load dataset from llama stack dataset provider"""
+        try:
+            all_rows = await self.datasetio_api.iterrows(
+                dataset_id=dataset_id,
+                limit=-1,
+            )
+            if not isinstance(all_rows.data, list):
+                raise RuntimeError("Expected dataset data to be a list")
+            return all_rows.data
+        except Exception as e:
+            raise RuntimeError(f"Failed to load dataset: {str(e)}") from e
+
+    def _run_training_sync(
+        self,
+        model: str,
+        provider_config: dict[str, Any],
+        peft_config: LoraConfig | None,
+        config: dict[str, Any],
+        output_dir_path: Path | None,
+    ) -> None:
+        """Synchronous wrapper for running training process.
+        This method serves as a bridge between the multiprocessing Process and the async training function.
+        It runs the async training coroutine to completion on a fresh event loop via asyncio.run.
+        All arguments are forwarded unchanged to self._run_training.
+        Args:
+            model: The model identifier to load
+            provider_config: HuggingFace provider configuration, annotated here as a plain dict
+            peft_config: Optional LoRA configuration
+            config: General training configuration, annotated here as a plain dict
+            output_dir_path: Optional path to save the model
+        """
+        import asyncio
+
+        logger.info("Starting training process with async wrapper")
+        asyncio.run(
+            self._run_training(
+                model=model,
+                provider_config=provider_config,
+                peft_config=peft_config,
+                config=config,
+                output_dir_path=output_dir_path,
+            )
+        )
+
+    async def load_dataset(
+        self,
+        model: str,
+        config: TrainingConfig,
+        provider_config: HuggingFacePostTrainingConfig,
+    ) -> tuple[Dataset, Dataset, AutoTokenizer]:
+        """Load and prepare the dataset for training.
+        Args:
+            model: The model identifier to load
+            config: Training configuration
+            provider_config: Provider-specific configuration
+        Returns:
+            tuple: (train_dataset, eval_dataset, tokenizer)
+        """
+        # Validate data config
+        if not config.data_config:
+            raise ValueError("DataConfig is required for training")
+
+        # Load dataset
+        logger.info(f"Loading dataset: {config.data_config.dataset_id}")
+        rows = await self._setup_data(config.data_config.dataset_id)
+        if not self.validate_dataset_format(rows):
+            raise ValueError("Dataset is missing required fields: input_query, expected_answer, chat_completion_input")
+        logger.info(f"Loaded {len(rows)} rows from dataset")
+
+        # Initialize tokenizer
+        logger.info(f"Initializing tokenizer for model: {model}")
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model, **provider_config.model_specific_config)
+
+            # Set pad token to eos token if not present
+            # This is common for models that don't have a dedicated pad token
+            if not tokenizer.pad_token:
+                tokenizer.pad_token = tokenizer.eos_token
+
+            # Set padding side to right for causal language modeling
+            # This ensures that padding tokens don't interfere with the model's ability
+            # to predict the next token in the sequence
+            tokenizer.padding_side = "right"
+
+            # Set truncation side to right to keep the beginning of the sequence
+            # This is important for maintaining context and instruction format
+            tokenizer.truncation_side = "right"
+
+            # Set model max length to match provider config
+            # This ensures consistent sequence lengths across the training process
+            tokenizer.model_max_length = provider_config.max_seq_length
+
+            logger.info("Tokenizer initialized successfully")
+        except Exception as e:
+            raise RuntimeError(f"Failed to initialize tokenizer: {str(e)}") from e
+
+        # Create and preprocess dataset
+        logger.info("Creating and preprocessing dataset")
+        try:
+            ds = self._create_dataset(rows, config, provider_config)
+            ds = self._preprocess_dataset(ds, tokenizer, provider_config)
+            logger.info(f"Dataset created with {len(ds)} examples")
+        except Exception as e:
+            raise ValueError(f"Failed to create dataset: {str(e)}") from e
+
+        # Split dataset
+        logger.info("Splitting dataset into train and validation sets")
+        train_val_split = ds.train_test_split(test_size=0.1, seed=42)
+        train_dataset = train_val_split["train"]
+        eval_dataset = train_val_split["test"]
+        logger.info(f"Split dataset into {len(train_dataset)} training and {len(eval_dataset)} validation examples")
+
+        return train_dataset, eval_dataset, tokenizer
+
+    def load_model(
+        self,
+        model: str,
+        device: torch.device,
+        provider_config: HuggingFacePostTrainingConfig,
+    ) -> AutoModelForCausalLM:
+        """Load and initialize the model for training.
+        Args:
+            model: The model identifier to load
+            device: The device to load the model onto
+            provider_config: Provider-specific configuration
+        Returns:
+            The loaded and initialized model
+        Raises:
+            RuntimeError: If model loading fails
+        """
+        logger.info("Loading the base model")
+        try:
+            model_config = AutoConfig.from_pretrained(model, **provider_config.model_specific_config)
+            model_obj = AutoModelForCausalLM.from_pretrained(
+                model,
+                torch_dtype="auto" if device.type != "cpu" else "float32",
+                quantization_config=None,
+                config=model_config,
+                **provider_config.model_specific_config,
+            )
+            # Always move model to specified device
+            model_obj = model_obj.to(device)
+            logger.info(f"Model loaded and moved to device: {model_obj.device}")
+            return model_obj
+        except Exception as e:
+            raise RuntimeError(f"Failed to load model: {str(e)}") from e
+
+    def setup_training_args(
+        self,
+        config: TrainingConfig,
+        provider_config: HuggingFacePostTrainingConfig,
+        device: torch.device,
+        output_dir_path: Path | None,
+        steps_per_epoch: int,
+    ) -> SFTConfig:
+        """Build the SFTConfig for a run; raises ValueError when config.data_config is missing.
+        Args:
+            config: Training configuration (epochs, step cap, optimizer and data settings)
+            provider_config: Provider-specific configuration (sequence length, warmup, dataloader options)
+            device: The device to train on; selects fp16 (cuda only) and the use_cpu flag
+            output_dir_path: Optional checkpoint directory; None disables saving and best-model reload
+            steps_per_epoch: Number of steps per epoch, used to derive eval/save/logging intervals
+        Returns:
+            Configured SFTConfig object
+        """
+        logger.info("Configuring training arguments")
+        lr = 2e-5  # Default learning rate when no optimizer config is supplied
+        if config.optimizer_config:
+            lr = config.optimizer_config.lr
+            logger.info(f"Using custom learning rate: {lr}")
+
+        # The batch size is read from the data config, so it must be present
+        if not config.data_config:
+            raise ValueError("DataConfig is required for training")
+        data_config = config.data_config
+
+        # Calculate steps; save_steps must be a multiple of eval_steps or load_best_model_at_end is rejected
+        total_steps = steps_per_epoch * config.n_epochs
+        max_steps = min(config.max_steps_per_epoch, total_steps)  # NOTE(review): caps the whole run, not one epoch
+        eval_steps = max(1, steps_per_epoch // 10)  # Evaluate 10 times per epoch
+        save_steps = max(1, steps_per_epoch // 5 // eval_steps) * eval_steps  # ~5x/epoch, multiple of eval_steps
+        logging_steps = max(1, steps_per_epoch // 50)  # Log 50 times per epoch
+
+        logger.info("Training configuration:")
+        logger.info(f"- Steps per epoch: {steps_per_epoch}")
+        logger.info(f"- Total steps: {total_steps}")
+        logger.info(f"- Max steps: {max_steps}")
+        logger.info(f"- Eval steps: {eval_steps}")
+        logger.info(f"- Save steps: {save_steps}")
+        logger.info(f"- Logging steps: {logging_steps}")
+
+        # Checkpoints are only written when an output directory was supplied
+        save_strategy = "no"
+        if output_dir_path:
+            save_strategy = "steps"
+            logger.info(f"Will save checkpoints to {output_dir_path}")
+
+        return SFTConfig(
+            max_steps=max_steps,
+            output_dir=str(output_dir_path) if output_dir_path is not None else None,
+            num_train_epochs=config.n_epochs,
+            per_device_train_batch_size=data_config.batch_size,
+            fp16=device.type == "cuda",
+            bf16=False,  # Causes CPU issues.
+            eval_strategy="steps",
+            use_cpu=device.type == "cpu" and not torch.backends.mps.is_available(),
+            save_strategy=save_strategy,
+            report_to="none",
+            max_seq_length=provider_config.max_seq_length,
+            gradient_accumulation_steps=config.gradient_accumulation_steps,
+            gradient_checkpointing=provider_config.gradient_checkpointing,
+            learning_rate=lr,
+            warmup_ratio=provider_config.warmup_ratio,
+            weight_decay=provider_config.weight_decay,
+            remove_unused_columns=False,
+            dataloader_pin_memory=provider_config.dataloader_pin_memory,
+            dataloader_num_workers=provider_config.dataloader_num_workers,
+            dataset_text_field="text",
+            packing=False,
+            load_best_model_at_end=bool(output_dir_path),  # Needs saved checkpoints to reload from
+            metric_for_best_model="eval_loss",
+            greater_is_better=False,
+            eval_steps=eval_steps,
+            save_steps=save_steps,
+            logging_steps=logging_steps,
+        )
+
+    def save_model(
+        self,
+        model_obj: AutoModelForCausalLM,
+        trainer: SFTTrainer,
+        peft_config: LoraConfig | None,
+        output_dir_path: Path,
+    ) -> None:
+        """Write the final model to the "merged_model" directory under output_dir_path.
+        Args:
+            model_obj: Model handed in by the caller; its config gets use_cache re-enabled
+            trainer: Trainer whose model is saved (merged first when LoRA was used)
+            peft_config: LoRA configuration, or None when no adapter was trained
+            output_dir_path: Directory under which "merged_model" is created
+        """
+        logger.info("Saving final model")
+        model_obj.config.use_cache = True
+
+        if not peft_config:
+            final_model = trainer.model
+        else:
+            logger.info("Merging LoRA weights with base model")
+            final_model = trainer.model.merge_and_unload()
+
+        destination = output_dir_path / "merged_model"
+        logger.info(f"Saving model to {destination}")
+        final_model.save_pretrained(destination)
+
+    async def _run_training(
+        self,
+        model: str,
+        provider_config: dict[str, Any],
+        peft_config: LoraConfig | None,
+        config: dict[str, Any],
+        output_dir_path: Path | None,
+    ) -> None:
+        """Install SIGTERM/SIGINT handlers, rebuild the config objects, run SFTTrainer and optionally save."""
+
+        def signal_handler(signum, frame):
+            """Log the signal and exit with status 0. NOTE(review): confirm 0 is intended for interrupted runs."""
+            logger.info(f"Received signal {signum}, initiating graceful shutdown")
+            sys.exit(0)
+
+        signal.signal(signal.SIGTERM, signal_handler)
+        signal.signal(signal.SIGINT, signal_handler)
+
+        # Rebuild typed config objects from plain dicts (presumably the model_dump() output built in train())
+        logger.info("Initializing configuration objects")
+        provider_config_obj = HuggingFacePostTrainingConfig(**provider_config)
+        config_obj = TrainingConfig(**config)
+
+        # Resolve the torch device from the provider config
+        device = setup_torch_device(provider_config_obj.device)
+        logger.info(f"Using device '{device}'")
+
+        # Returns train/eval datasets plus a tokenizer; the tokenizer is not used again in this method
+        train_dataset, eval_dataset, tokenizer = await self.load_dataset(model, config_obj, provider_config_obj)
+
+        # Steps per epoch: dataset size floor-divided by batch size (a trailing partial batch is not counted)
+        if not config_obj.data_config:
+            raise ValueError("DataConfig is required for training")
+        steps_per_epoch = len(train_dataset) // config_obj.data_config.batch_size
+
+        # Build the SFTConfig for this run
+        training_args = self.setup_training_args(
+            config_obj,
+            provider_config_obj,
+            device,
+            output_dir_path,
+            steps_per_epoch,
+        )
+
+        # Load the base model onto the resolved device
+        model_obj = self.load_model(model, device, provider_config_obj)
+
+        # Build the trainer; peft_config is forwarded as-is and may be None
+        logger.info("Initializing SFTTrainer")
+        trainer = SFTTrainer(
+            model=model_obj,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            peft_config=peft_config,
+            args=training_args,
+        )
+
+        try:
+            # Run the training loop (a synchronous call inside this coroutine)
+            logger.info("Starting training")
+            trainer.train()
+            logger.info("Training completed successfully")
+
+            # Persist the final model only when an output directory was provided
+            if output_dir_path:
+                self.save_model(model_obj, trainer, peft_config, output_dir_path)
+
+        finally:
+            # Runs even if training or saving raised; evacuate_model_from_device presumably frees device memory
+            logger.info("Cleaning up resources")
+            if hasattr(trainer, "model"):
+                evacuate_model_from_device(trainer.model, device.type)
+            del trainer
+            gc.collect()
+            logger.info("Cleanup completed")
+
+    async def train(
+        self,
+        model: str,
+        output_dir: str | None,
+        job_uuid: str,
+        lora_config: LoraFinetuningConfig,
+        config: TrainingConfig,
+        provider_config: HuggingFacePostTrainingConfig,
+    ) -> tuple[dict[str, Any], list[Checkpoint] | None]:
+        """Run SFT training in a child process and return (memory_stats, checkpoints-or-None)."""
+        # Resolve the torch device from the provider config
+        device = setup_torch_device(provider_config.device)
+        logger.info(f"Using device '{device}'")
+
+        output_dir_path = None
+        if output_dir:
+            output_dir_path = Path(output_dir)
+
+        # Memory snapshots taken in this (parent) process: at start, after training, and in the finally block
+        memory_stats = {
+            "initial": get_memory_stats(device),
+            "after_training": None,
+            "final": None,
+        }
+
+        # Translate the LoRA request into a LoraConfig; dropout is fixed at 0.1 and bias at "none"
+        peft_config = None
+        if lora_config:
+            peft_config = LoraConfig(
+                lora_alpha=lora_config.alpha,
+                lora_dropout=0.1,
+                r=lora_config.rank,
+                bias="none",
+                task_type="CAUSAL_LM",
+                target_modules=lora_config.lora_attn_modules,
+            )
+
+        # Fail fast with ValueError before starting a process if no data config was supplied
+        if not config.data_config:
+            raise ValueError("DataConfig is required for training")
+
+        # Run training in a separate process; a failure there is detected through its exit code below
+        logger.info("Starting training in separate process")
+        try:
+            # Use 'spawn' for CUDA/MPS; NOTE(review): force=True changes the process-wide start method
+            if device.type in ["cuda", "mps"]:
+                multiprocessing.set_start_method("spawn", force=True)
+
+            process = multiprocessing.Process(
+                target=self._run_training_sync,
+                kwargs={
+                    "model": model,
+                    "provider_config": provider_config.model_dump(),
+                    "peft_config": peft_config,
+                    "config": config.model_dump(),
+                    "output_dir_path": output_dir_path,
+                },
+            )
+            process.start()
+
+            # Poll until the child exits. NOTE(review): join() is a blocking call inside this coroutine
+            while process.is_alive():
+                process.join(timeout=1)  # Check every second
+                if not process.is_alive():
+                    break
+
+            # A non-zero exit code from the child is surfaced to the caller as RuntimeError
+            if process.exitcode != 0:
+                raise RuntimeError(f"Training failed with exit code {process.exitcode}")
+
+            memory_stats["after_training"] = get_memory_stats(device)
+
+            checkpoints = None
+            if output_dir_path:
+                # Single checkpoint entry whose path is the "merged_model" directory under the output directory
+                checkpoint = Checkpoint(
+                    identifier=f"{model}-sft-{config.n_epochs}",
+                    created_at=datetime.now(timezone.utc),
+                    epoch=config.n_epochs,
+                    post_training_job_id=job_uuid,
+                    path=str(output_dir_path / "merged_model"),
+                )
+                checkpoints = [checkpoint]
+
+            return memory_stats, checkpoints
+        finally:
+            memory_stats["final"] = get_memory_stats(device)  # Mutates the same dict object that is returned
+            gc.collect()
diff --git a/llama_stack/providers/inline/post_training/torchtune/__init__.py b/llama_stack/providers/inline/post_training/torchtune/__init__.py
index ca7801be7..7a2f9eba2 100644
--- a/llama_stack/providers/inline/post_training/torchtune/__init__.py
+++ b/llama_stack/providers/inline/post_training/torchtune/__init__.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import Api
 
@@ -15,7 +15,7 @@ from .config import TorchtunePostTrainingConfig
 
 async def get_provider_impl(
     config: TorchtunePostTrainingConfig,
-    deps: Dict[Api, Any],
+    deps: dict[Api, Any],
 ):
     from .post_training import TorchtunePostTrainingImpl
 
diff --git a/llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py b/llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py
index fcadd0884..af8bd2765 100644
--- a/llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py
+++ b/llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py
@@ -8,7 +8,7 @@ import json
 import os
 import shutil
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any
 
 import torch
 from safetensors.torch import save_file
@@ -34,7 +34,7 @@ class TorchtuneCheckpointer:
         model_id: str,
         training_algorithm: str,
         checkpoint_dir: str,
-        checkpoint_files: List[str],
+        checkpoint_files: list[str],
         output_dir: str,
         model_type: str,
     ):
@@ -54,11 +54,11 @@ class TorchtuneCheckpointer:
         # get ckpt paths
         self._checkpoint_path = Path.joinpath(self._checkpoint_dir, self._checkpoint_file)
 
-    def load_checkpoint(self) -> Dict[str, Any]:
+    def load_checkpoint(self) -> dict[str, Any]:
         """
         Load Meta checkpoint from file. Currently only loading from a single file is supported.
         """
-        state_dict: Dict[str, Any] = {}
+        state_dict: dict[str, Any] = {}
         model_state_dict = safe_torch_load(self._checkpoint_path)
         if self._model_type == ModelType.LLAMA3_VISION:
             from torchtune.models.llama3_2_vision._convert_weights import (
@@ -82,7 +82,7 @@ class TorchtuneCheckpointer:
 
     def save_checkpoint(
         self,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         epoch: int,
         adapter_only: bool = False,
         checkpoint_format: str | None = None,
@@ -100,7 +100,7 @@ class TorchtuneCheckpointer:
     def _save_meta_format_checkpoint(
         self,
         model_file_path: Path,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
         adapter_only: bool = False,
     ) -> None:
         model_file_path.mkdir(parents=True, exist_ok=True)
@@ -168,7 +168,7 @@ class TorchtuneCheckpointer:
     def _save_hf_format_checkpoint(
         self,
         model_file_path: Path,
-        state_dict: Dict[str, Any],
+        state_dict: dict[str, Any],
     ) -> None:
         # the config.json file contains model params needed for state dict conversion
         config = json.loads(Path.joinpath(self._checkpoint_dir.parent, "config.json").read_text())
@@ -179,7 +179,7 @@ class TorchtuneCheckpointer:
         repo_id_path = Path.joinpath(self._checkpoint_dir.parent, REPO_ID_FNAME).with_suffix(".json")
         self.repo_id = None
         if repo_id_path.exists():
-            with open(repo_id_path, "r") as json_file:
+            with open(repo_id_path) as json_file:
                 data = json.load(json_file)
                 self.repo_id = data.get("repo_id")
 
diff --git a/llama_stack/providers/inline/post_training/torchtune/common/utils.py b/llama_stack/providers/inline/post_training/torchtune/common/utils.py
index a040ca1b0..f0fa052a2 100644
--- a/llama_stack/providers/inline/post_training/torchtune/common/utils.py
+++ b/llama_stack/providers/inline/post_training/torchtune/common/utils.py
@@ -10,7 +10,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Callable, Dict
+from collections.abc import Callable
 
 import torch
 from pydantic import BaseModel
@@ -35,7 +35,7 @@ class ModelConfig(BaseModel):
     checkpoint_type: str
 
 
-MODEL_CONFIGS: Dict[str, ModelConfig] = {
+MODEL_CONFIGS: dict[str, ModelConfig] = {
     "Llama3.2-3B-Instruct": ModelConfig(
         model_definition=lora_llama3_2_3b,
         tokenizer_type=llama3_tokenizer,
@@ -48,7 +48,7 @@ MODEL_CONFIGS: Dict[str, ModelConfig] = {
     ),
 }
 
-DATA_FORMATS: Dict[str, Transform] = {
+DATA_FORMATS: dict[str, Transform] = {
     "instruct": InputOutputToMessages,
     "dialog": ShareGPTToMessages,
 }
diff --git a/llama_stack/providers/inline/post_training/torchtune/config.py b/llama_stack/providers/inline/post_training/torchtune/config.py
index ee3504f9e..f3ce874aa 100644
--- a/llama_stack/providers/inline/post_training/torchtune/config.py
+++ b/llama_stack/providers/inline/post_training/torchtune/config.py
@@ -4,17 +4,17 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Literal, Optional
+from typing import Any, Literal
 
 from pydantic import BaseModel
 
 
 class TorchtunePostTrainingConfig(BaseModel):
-    torch_seed: Optional[int] = None
-    checkpoint_format: Optional[Literal["meta", "huggingface"]] = "meta"
+    torch_seed: int | None = None
+    checkpoint_format: Literal["meta", "huggingface"] | None = "meta"
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "checkpoint_format": "meta",
         }
diff --git a/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py b/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py
index 6b607f1c7..96dd8b8dd 100644
--- a/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py
+++ b/llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py
@@ -11,7 +11,8 @@
 # LICENSE file in the root directory of this source tree.
 
 import json
-from typing import Any, Mapping
+from collections.abc import Mapping
+from typing import Any
 
 from llama_stack.providers.utils.common.data_schema_validator import ColumnName
 
diff --git a/llama_stack/providers/inline/post_training/torchtune/datasets/sft.py b/llama_stack/providers/inline/post_training/torchtune/datasets/sft.py
index 050996860..ae7faf31e 100644
--- a/llama_stack/providers/inline/post_training/torchtune/datasets/sft.py
+++ b/llama_stack/providers/inline/post_training/torchtune/datasets/sft.py
@@ -10,7 +10,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Any, Dict, List, Mapping
+from collections.abc import Mapping
+from typing import Any
 
 import numpy as np
 from torch.utils.data import Dataset
@@ -27,7 +28,7 @@ from llama_stack.providers.inline.post_training.torchtune.datasets.format_adapte
 class SFTDataset(Dataset):
     def __init__(
         self,
-        rows: List[Dict[str, Any]],
+        rows: list[dict[str, Any]],
         message_transform: Transform,
         model_transform: Transform,
         dataset_type: str,
@@ -40,11 +41,11 @@ class SFTDataset(Dataset):
     def __len__(self):
         return len(self._rows)
 
-    def __getitem__(self, index: int) -> Dict[str, Any]:
+    def __getitem__(self, index: int) -> dict[str, Any]:
         sample = self._rows[index]
         return self._prepare_sample(sample)
 
-    def _prepare_sample(self, sample: Mapping[str, Any]) -> Dict[str, Any]:
+    def _prepare_sample(self, sample: Mapping[str, Any]) -> dict[str, Any]:
         if self._dataset_type == "instruct":
             sample = llama_stack_instruct_to_torchtune_instruct(sample)
         elif self._dataset_type == "dialog":
diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py
index cc1a6a5fe..c7d8d6758 100644
--- a/llama_stack/providers/inline/post_training/torchtune/post_training.py
+++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from enum import Enum
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
@@ -64,7 +64,7 @@ class TorchtunePostTrainingImpl:
         )
 
     @staticmethod
-    def _resources_stats_to_artifact(resources_stats: Dict[str, Any]) -> JobArtifact:
+    def _resources_stats_to_artifact(resources_stats: dict[str, Any]) -> JobArtifact:
         return JobArtifact(
             type=TrainingArtifactType.RESOURCES_STATS.value,
             name=TrainingArtifactType.RESOURCES_STATS.value,
@@ -75,11 +75,11 @@ class TorchtunePostTrainingImpl:
         self,
         job_uuid: str,
         training_config: TrainingConfig,
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
         model: str,
-        checkpoint_dir: Optional[str],
-        algorithm_config: Optional[AlgorithmConfig],
+        checkpoint_dir: str | None,
+        algorithm_config: AlgorithmConfig | None,
     ) -> PostTrainingJob:
         if isinstance(algorithm_config, LoraFinetuningConfig):
 
@@ -121,8 +121,8 @@ class TorchtunePostTrainingImpl:
         finetuned_model: str,
         algorithm_config: DPOAlignmentConfig,
         training_config: TrainingConfig,
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
     ) -> PostTrainingJob: ...
 
     async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
@@ -144,7 +144,7 @@ class TorchtunePostTrainingImpl:
         return data[0] if data else None
 
     @webmethod(route="/post-training/job/status")
-    async def get_training_job_status(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]:
+    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse | None:
         job = self._scheduler.get_job(job_uuid)
 
         match job.status:
@@ -175,6 +175,6 @@ class TorchtunePostTrainingImpl:
         self._scheduler.cancel(job_uuid)
 
     @webmethod(route="/post-training/job/artifacts")
-    async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]:
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse | None:
         job = self._scheduler.get_job(job_uuid)
         return PostTrainingJobArtifactsResponse(job_uuid=job_uuid, checkpoints=self._get_checkpoints(job))
diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
index 04bf86b97..f56dd2499 100644
--- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
+++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
@@ -4,14 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import gc
 import logging
 import os
 import time
 from datetime import datetime, timezone
 from functools import partial
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import torch
 from torch import nn
@@ -39,7 +38,6 @@ from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.post_training import (
     Checkpoint,
     DataConfig,
-    EfficiencyConfig,
     LoraFinetuningConfig,
     OptimizerConfig,
     QATFinetuningConfig,
@@ -48,9 +46,7 @@ from llama_stack.apis.post_training import (
 from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
 from llama_stack.distribution.utils.model_utils import model_local_dir
 from llama_stack.models.llama.sku_list import resolve_model
-from llama_stack.providers.inline.post_training.common.validator import (
-    validate_input_dataset_schema,
-)
+from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device
 from llama_stack.providers.inline.post_training.torchtune.common import utils
 from llama_stack.providers.inline.post_training.torchtune.common.checkpointer import (
     TorchtuneCheckpointer,
@@ -83,18 +79,16 @@ class LoraFinetuningSingleDevice:
         config: TorchtunePostTrainingConfig,
         job_uuid: str,
         training_config: TrainingConfig,
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
         model: str,
-        checkpoint_dir: Optional[str],
+        checkpoint_dir: str | None,
         algorithm_config: LoraFinetuningConfig | QATFinetuningConfig | None,
         datasetio_api: DatasetIO,
         datasets_api: Datasets,
     ) -> None:
         assert isinstance(training_config.data_config, DataConfig), "DataConfig must be initialized"
 
-        assert isinstance(training_config.efficiency_config, EfficiencyConfig), "EfficiencyConfig must be initialized"
-
         self.job_uuid = job_uuid
         self.training_config = training_config
         if not isinstance(algorithm_config, LoraFinetuningConfig):
@@ -159,7 +153,7 @@ class LoraFinetuningSingleDevice:
         self.datasets_api = datasets_api
 
     async def load_checkpoint(self):
-        def get_checkpoint_files(checkpoint_dir: str) -> List[str]:
+        def get_checkpoint_files(checkpoint_dir: str) -> list[str]:
             try:
                 # List all files in the given directory
                 files = os.listdir(checkpoint_dir)
@@ -253,8 +247,8 @@ class LoraFinetuningSingleDevice:
         self,
         enable_activation_checkpointing: bool,
         enable_activation_offloading: bool,
-        base_model_state_dict: Dict[str, Any],
-        lora_weights_state_dict: Optional[Dict[str, Any]] = None,
+        base_model_state_dict: dict[str, Any],
+        lora_weights_state_dict: dict[str, Any] | None = None,
     ) -> nn.Module:
         self._lora_rank = self.algorithm_config.rank
         self._lora_alpha = self.algorithm_config.alpha
@@ -338,7 +332,7 @@ class LoraFinetuningSingleDevice:
         tokenizer: Llama3Tokenizer,
         shuffle: bool,
         batch_size: int,
-    ) -> Tuple[DistributedSampler, DataLoader]:
+    ) -> tuple[DistributedSampler, DataLoader]:
         async def fetch_rows(dataset_id: str):
             return await self.datasetio_api.iterrows(
                 dataset_id=dataset_id,
@@ -348,11 +342,9 @@ class LoraFinetuningSingleDevice:
         all_rows = await fetch_rows(dataset_id)
         rows = all_rows.data
 
-        await validate_input_dataset_schema(
-            datasets_api=self.datasets_api,
-            dataset_id=dataset_id,
-            dataset_type=self._data_format.value,
-        )
+        # TODO (xiyan): validate dataset schema
+        # dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
+
         data_transform = await utils.get_data_transform(self._data_format)
         ds = SFTDataset(
             rows,
@@ -435,7 +427,7 @@ class LoraFinetuningSingleDevice:
             checkpoint_format=self._checkpoint_format,
         )
 
-    async def _loss_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
+    async def _loss_step(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
         # Shape [b, s], needed for the loss not the model
         labels = batch.pop("labels")
         # run model
@@ -457,7 +449,7 @@ class LoraFinetuningSingleDevice:
 
         return loss
 
-    async def train(self) -> Tuple[Dict[str, Any], List[Checkpoint]]:
+    async def train(self) -> tuple[dict[str, Any], list[Checkpoint]]:
         """
         The core training loop.
         """
@@ -469,7 +461,7 @@ class LoraFinetuningSingleDevice:
 
         # training artifacts
         checkpoints = []
-        memory_stats: Dict[str, Any] = {}
+        memory_stats: dict[str, Any] = {}
 
         # self.epochs_run should be non-zero when we're resuming from a checkpoint
         for curr_epoch in range(self.epochs_run, self.total_epochs):
@@ -562,15 +554,11 @@ class LoraFinetuningSingleDevice:
             checkpoints.append(checkpoint)
 
         # clean up the memory after training finishes
-        if self._device.type != "cpu":
-            self._model.to("cpu")
-            torch.cuda.empty_cache()
-        del self._model
-        gc.collect()
+        evacuate_model_from_device(self._model, self._device.type)
 
         return (memory_stats, checkpoints)
 
-    async def validation(self) -> Tuple[float, float]:
+    async def validation(self) -> tuple[float, float]:
         total_loss = 0.0
         total_tokens = 0
         log.info("Starting validation...")
diff --git a/llama_stack/providers/inline/safety/code_scanner/__init__.py b/llama_stack/providers/inline/safety/code_scanner/__init__.py
index 62975a963..68e32b747 100644
--- a/llama_stack/providers/inline/safety/code_scanner/__init__.py
+++ b/llama_stack/providers/inline/safety/code_scanner/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from .config import CodeScannerConfig
 
 
-async def get_provider_impl(config: CodeScannerConfig, deps: Dict[str, Any]):
+async def get_provider_impl(config: CodeScannerConfig, deps: dict[str, Any]):
     from .code_scanner import MetaReferenceCodeScannerSafetyImpl
 
     impl = MetaReferenceCodeScannerSafetyImpl(config, deps)
diff --git a/llama_stack/providers/inline/safety/code_scanner/code_scanner.py b/llama_stack/providers/inline/safety/code_scanner/code_scanner.py
index 606d11d2c..be05ee436 100644
--- a/llama_stack/providers/inline/safety/code_scanner/code_scanner.py
+++ b/llama_stack/providers/inline/safety/code_scanner/code_scanner.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import logging
-from typing import Any, Dict, List
+from typing import Any
 
 from llama_stack.apis.inference import Message
 from llama_stack.apis.safety import (
@@ -48,8 +48,8 @@ class MetaReferenceCodeScannerSafetyImpl(Safety):
     async def run_shield(
         self,
         shield_id: str,
-        messages: List[Message],
-        params: Dict[str, Any] = None,
+        messages: list[Message],
+        params: dict[str, Any] = None,
     ) -> RunShieldResponse:
         shield = await self.shield_store.get_shield(shield_id)
         if not shield:
diff --git a/llama_stack/providers/inline/safety/code_scanner/config.py b/llama_stack/providers/inline/safety/code_scanner/config.py
index 1d880ee9c..66eb8e368 100644
--- a/llama_stack/providers/inline/safety/code_scanner/config.py
+++ b/llama_stack/providers/inline/safety/code_scanner/config.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
 
 class CodeScannerConfig(BaseModel):
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {}
diff --git a/llama_stack/providers/inline/safety/llama_guard/__init__.py b/llama_stack/providers/inline/safety/llama_guard/__init__.py
index a4263b169..8865cc344 100644
--- a/llama_stack/providers/inline/safety/llama_guard/__init__.py
+++ b/llama_stack/providers/inline/safety/llama_guard/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from .config import LlamaGuardConfig
 
 
-async def get_provider_impl(config: LlamaGuardConfig, deps: Dict[str, Any]):
+async def get_provider_impl(config: LlamaGuardConfig, deps: dict[str, Any]):
     from .llama_guard import LlamaGuardSafetyImpl
 
     assert isinstance(config, LlamaGuardConfig), f"Unexpected config type: {type(config)}"
diff --git a/llama_stack/providers/inline/safety/llama_guard/config.py b/llama_stack/providers/inline/safety/llama_guard/config.py
index 53849ab33..412e7218d 100644
--- a/llama_stack/providers/inline/safety/llama_guard/config.py
+++ b/llama_stack/providers/inline/safety/llama_guard/config.py
@@ -4,16 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List
+from typing import Any
 
 from pydantic import BaseModel
 
 
 class LlamaGuardConfig(BaseModel):
-    excluded_categories: List[str] = []
+    excluded_categories: list[str] = []
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "excluded_categories": [],
         }
diff --git a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
index 2ab16f986..937301c2e 100644
--- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
+++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
@@ -6,7 +6,7 @@
 
 import re
 from string import Template
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem
 from llama_stack.apis.inference import (
@@ -149,8 +149,8 @@ class LlamaGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
     async def run_shield(
         self,
         shield_id: str,
-        messages: List[Message],
-        params: Dict[str, Any] = None,
+        messages: list[Message],
+        params: dict[str, Any] = None,
     ) -> RunShieldResponse:
         shield = await self.shield_store.get_shield(shield_id)
         if not shield:
@@ -177,7 +177,7 @@ class LlamaGuardShield:
         self,
         model: str,
         inference_api: Inference,
-        excluded_categories: Optional[List[str]] = None,
+        excluded_categories: list[str] | None = None,
     ):
         if excluded_categories is None:
             excluded_categories = []
@@ -193,7 +193,7 @@ class LlamaGuardShield:
         self.inference_api = inference_api
         self.excluded_categories = excluded_categories
 
-    def check_unsafe_response(self, response: str) -> Optional[str]:
+    def check_unsafe_response(self, response: str) -> str | None:
         match = re.match(r"^unsafe\n(.*)$", response)
         if match:
             # extracts the unsafe code
@@ -202,7 +202,7 @@ class LlamaGuardShield:
 
         return None
 
-    def get_safety_categories(self) -> List[str]:
+    def get_safety_categories(self) -> list[str]:
         excluded_categories = self.excluded_categories
         if set(excluded_categories) == set(SAFETY_CATEGORIES_TO_CODE_MAP.values()):
             excluded_categories = []
@@ -218,7 +218,7 @@ class LlamaGuardShield:
 
         return final_categories
 
-    def validate_messages(self, messages: List[Message]) -> None:
+    def validate_messages(self, messages: list[Message]) -> None:
         if len(messages) == 0:
             raise ValueError("Messages must not be empty")
         if messages[0].role != Role.user.value:
@@ -229,7 +229,7 @@ class LlamaGuardShield:
 
         return messages
 
-    async def run(self, messages: List[Message]) -> RunShieldResponse:
+    async def run(self, messages: list[Message]) -> RunShieldResponse:
         messages = self.validate_messages(messages)
 
         if self.model == CoreModelId.llama_guard_3_11b_vision.value:
@@ -247,10 +247,10 @@ class LlamaGuardShield:
         content = content.strip()
         return self.get_shield_response(content)
 
-    def build_text_shield_input(self, messages: List[Message]) -> UserMessage:
+    def build_text_shield_input(self, messages: list[Message]) -> UserMessage:
         return UserMessage(content=self.build_prompt(messages))
 
-    def build_vision_shield_input(self, messages: List[Message]) -> UserMessage:
+    def build_vision_shield_input(self, messages: list[Message]) -> UserMessage:
         conversation = []
         most_recent_img = None
 
@@ -284,7 +284,7 @@ class LlamaGuardShield:
 
         return UserMessage(content=prompt)
 
-    def build_prompt(self, messages: List[Message]) -> str:
+    def build_prompt(self, messages: list[Message]) -> str:
         categories = self.get_safety_categories()
         categories_str = "\n".join(categories)
         conversations_str = "\n\n".join(
diff --git a/llama_stack/providers/inline/safety/prompt_guard/__init__.py b/llama_stack/providers/inline/safety/prompt_guard/__init__.py
index 747f34421..1761c9138 100644
--- a/llama_stack/providers/inline/safety/prompt_guard/__init__.py
+++ b/llama_stack/providers/inline/safety/prompt_guard/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
-from .config import PromptGuardConfig  # noqa: F401
+from .config import PromptGuardConfig
 
 
-async def get_provider_impl(config: PromptGuardConfig, deps: Dict[str, Any]):
+async def get_provider_impl(config: PromptGuardConfig, deps: dict[str, Any]):
     from .prompt_guard import PromptGuardSafetyImpl
 
     impl = PromptGuardSafetyImpl(config, deps)
diff --git a/llama_stack/providers/inline/safety/prompt_guard/config.py b/llama_stack/providers/inline/safety/prompt_guard/config.py
index 76bd5978d..69ea512c5 100644
--- a/llama_stack/providers/inline/safety/prompt_guard/config.py
+++ b/llama_stack/providers/inline/safety/prompt_guard/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel, field_validator
 
@@ -26,7 +26,7 @@ class PromptGuardConfig(BaseModel):
         return v
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "guard_type": "injection",
         }
diff --git a/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py b/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py
index fce3e3d14..ff87889ea 100644
--- a/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py
+++ b/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import logging
-from typing import Any, Dict, List
+from typing import Any
 
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
@@ -49,8 +49,8 @@ class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
     async def run_shield(
         self,
         shield_id: str,
-        messages: List[Message],
-        params: Dict[str, Any] = None,
+        messages: list[Message],
+        params: dict[str, Any] = None,
     ) -> RunShieldResponse:
         shield = await self.shield_store.get_shield(shield_id)
         if not shield:
@@ -75,13 +75,15 @@ class PromptGuardShield:
         self.temperature = temperature
         self.threshold = threshold
 
-        self.device = "cuda"
+        self.device = "cpu"
+        if torch.cuda.is_available():
+            self.device = "cuda"
 
         # load model and tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
         self.model = AutoModelForSequenceClassification.from_pretrained(model_dir, device_map=self.device)
 
-    async def run(self, messages: List[Message]) -> RunShieldResponse:
+    async def run(self, messages: list[Message]) -> RunShieldResponse:
         message = messages[-1]
         text = interleaved_content_as_str(message.content)
 
diff --git a/llama_stack/providers/inline/scoring/basic/__init__.py b/llama_stack/providers/inline/scoring/basic/__init__.py
index 4898b973a..d9d150b1a 100644
--- a/llama_stack/providers/inline/scoring/basic/__init__.py
+++ b/llama_stack/providers/inline/scoring/basic/__init__.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import Api
 
@@ -12,7 +12,7 @@ from .config import BasicScoringConfig
 
 async def get_provider_impl(
     config: BasicScoringConfig,
-    deps: Dict[Api, Any],
+    deps: dict[Api, Any],
 ):
     from .scoring import BasicScoringImpl
 
diff --git a/llama_stack/providers/inline/scoring/basic/config.py b/llama_stack/providers/inline/scoring/basic/config.py
index 5866be359..e9c7fb451 100644
--- a/llama_stack/providers/inline/scoring/basic/config.py
+++ b/llama_stack/providers/inline/scoring/basic/config.py
@@ -3,12 +3,12 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
 
 class BasicScoringConfig(BaseModel):
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {}
diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py
index 9a45f7139..09f89be5e 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
@@ -66,7 +66,7 @@ class BasicScoringImpl(
 
     async def shutdown(self) -> None: ...
 
-    async def list_scoring_functions(self) -> List[ScoringFn]:
+    async def list_scoring_functions(self) -> list[ScoringFn]:
         scoring_fn_defs_list = [
             fn_def for impl in self.scoring_fn_id_impls.values() for fn_def in impl.get_supported_scoring_fn_defs()
         ]
@@ -82,7 +82,7 @@ class BasicScoringImpl(
     async def score_batch(
         self,
         dataset_id: str,
-        scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
+        scoring_functions: dict[str, ScoringFnParams | None] = None,
         save_results_dataset: bool = False,
     ) -> ScoreBatchResponse:
         dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
@@ -107,8 +107,8 @@ class BasicScoringImpl(
 
     async def score(
         self,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
+        input_rows: list[dict[str, Any]],
+        scoring_functions: dict[str, ScoringFnParams | None] = None,
     ) -> ScoreResponse:
         res = {}
         for scoring_fn_id in scoring_functions.keys():
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
index f37780f3e..b29620be2 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
@@ -6,7 +6,7 @@
 
 import json
 import re
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams
@@ -17,7 +17,7 @@ from ..utils.bfcl.checker import ast_checker, is_empty_output
 from .fn_defs.bfcl import bfcl
 
 
-def postprocess(x: Dict[str, Any], test_category: str) -> Dict[str, Any]:
+def postprocess(x: dict[str, Any], test_category: str) -> dict[str, Any]:
     contain_func_call = False
     error = None
     error_type = None
@@ -52,11 +52,11 @@ def postprocess(x: Dict[str, Any], test_category: str) -> Dict[str, Any]:
     }
 
 
-def gen_valid(x: Dict[str, Any]) -> Dict[str, float]:
+def gen_valid(x: dict[str, Any]) -> dict[str, float]:
     return {"valid": x["valid"]}
 
 
-def gen_relevance_acc(x: Dict[str, Any]) -> Dict[str, float]:
+def gen_relevance_acc(x: dict[str, Any]) -> dict[str, float]:
     # This function serves for both relevance and irrelevance tests, which share the exact opposite logic.
     # If `test_category` is "irrelevance", the model is expected to output no function call.
     # No function call means either the AST decoding fails (a error message is generated) or the decoded AST does not contain any function call (such as a empty list, `[]`).
@@ -78,9 +78,9 @@ class BFCLScoringFn(RegisteredBaseScoringFn):
 
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = "bfcl",
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = "bfcl",
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"])
         score_result = postprocess(input_row, test_category)
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
index 84ca55732..b87974d08 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
@@ -6,7 +6,7 @@
 
 import json
 import re
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams
@@ -228,9 +228,9 @@ class DocVQAScoringFn(RegisteredBaseScoringFn):
 
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = "docvqa",
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = "docvqa",
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         expected_answers = json.loads(input_row["expected_answer"])
         generated_answer = input_row["generated_answer"]
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py
index 0bd6bdd48..60804330f 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams
@@ -26,9 +26,9 @@ class EqualityScoringFn(RegisteredBaseScoringFn):
 
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = "equality",
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = "equality",
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         assert "expected_answer" in input_row, "Expected answer not found in input row."
         assert "generated_answer" in input_row, "Generated answer not found in input row."
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
index 6ff856684..77f6176e6 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams
@@ -28,9 +28,9 @@ class IfEvalScoringFn(RegisteredBaseScoringFn):
 
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
 
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
index d6c78a9ac..d765959a8 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
@@ -28,9 +28,9 @@ class RegexParserMathResponseScoringFn(RegisteredBaseScoringFn):
 
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         assert scoring_fn_identifier is not None, "Scoring function identifier not found."
         fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py
index 0606a9581..cb336e303 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import re
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
@@ -28,9 +28,9 @@ class RegexParserScoringFn(RegisteredBaseScoringFn):
 
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         assert scoring_fn_identifier is not None, "Scoring function identifier not found."
         fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py
index 71defc433..d6e10e6c9 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams
@@ -26,9 +26,9 @@ class SubsetOfScoringFn(RegisteredBaseScoringFn):
 
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = "subset_of",
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = "subset_of",
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         expected_answer = input_row["expected_answer"]
         generated_answer = input_row["generated_answer"]
diff --git a/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py b/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
index 28605159f..b74c3826e 100644
--- a/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
@@ -11,8 +11,8 @@ import logging
 import random
 import re
 import string
+from collections.abc import Iterable, Sequence
 from types import MappingProxyType
-from typing import Dict, Iterable, List, Optional, Sequence, Union
 
 import emoji
 import langdetect
@@ -1673,12 +1673,11 @@ def split_chinese_japanese_hindi(lines: str) -> Iterable[str]:
     The separator for hindi is '।'
     """
     for line in lines.splitlines():
-        for sent in re.findall(
+        yield from re.findall(
             r"[^!?。\.\!\?\!\?\.\n।]+[!?。\.\!\?\!\?\.\n।]?",
             line.strip(),
             flags=re.U,
-        ):
-            yield sent
+        )
 
 
 def count_words_cjk(text: str) -> int:
@@ -1707,7 +1706,7 @@ def count_words_cjk(text: str) -> int:
     return non_asian_words_cnt + asian_chars_cnt + emoji_cnt
 
 
-@functools.lru_cache(maxsize=None)
+@functools.cache
 def _get_sentence_tokenizer():
     return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
 
@@ -1719,8 +1718,8 @@ def count_sentences(text):
     return len(tokenized_sentences)
 
 
-def get_langid(text: str, lid_path: Optional[str] = None) -> str:
-    line_langs: List[str] = []
+def get_langid(text: str, lid_path: str | None = None) -> str:
+    line_langs: list[str] = []
     lines = [line.strip() for line in text.split("\n") if len(line.strip()) >= 4]
 
     for line in lines:
@@ -1741,7 +1740,7 @@ def generate_keywords(num_keywords):
 
 
 """Library of instructions"""
-_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]
+_InstructionArgsDtype = dict[str, int | str | Sequence[str]] | None
 
 _LANGUAGES = LANGUAGE_CODES
 
diff --git a/llama_stack/providers/inline/scoring/basic/utils/math_utils.py b/llama_stack/providers/inline/scoring/basic/utils/math_utils.py
index e11fc625b..6840aad14 100644
--- a/llama_stack/providers/inline/scoring/basic/utils/math_utils.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/math_utils.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import re
-from typing import Sequence
+from collections.abc import Sequence
 
 from llama_stack.providers.utils.scoring.basic_scoring_utils import time_limit
 
@@ -323,7 +323,7 @@ def _fix_a_slash_b(string: str) -> str:
     try:
         ia = int(a)
         ib = int(b)
-        assert string == "{}/{}".format(ia, ib)
+        assert string == f"{ia}/{ib}"
         new_string = "\\frac{" + str(ia) + "}{" + str(ib) + "}"
         return new_string
     except (ValueError, AssertionError):
diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py
index f1b0112d9..8ea6e9b96 100644
--- a/llama_stack/providers/inline/scoring/braintrust/__init__.py
+++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -18,7 +18,7 @@ class BraintrustProviderDataValidator(BaseModel):
 
 async def get_provider_impl(
     config: BraintrustScoringConfig,
-    deps: Dict[Api, Any],
+    deps: dict[Api, Any],
 ):
     from .braintrust import BraintrustScoringImpl
 
diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
index 3fae83340..d6655d657 100644
--- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import os
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from autoevals.llm import Factuality
 from autoevals.ragas import (
@@ -132,7 +132,7 @@ class BraintrustScoringImpl(
 
     async def shutdown(self) -> None: ...
 
-    async def list_scoring_functions(self) -> List[ScoringFn]:
+    async def list_scoring_functions(self) -> list[ScoringFn]:
         scoring_fn_defs_list = list(self.supported_fn_defs_registry.values())
         for f in scoring_fn_defs_list:
             assert f.identifier.startswith("braintrust"), (
@@ -159,7 +159,7 @@ class BraintrustScoringImpl(
     async def score_batch(
         self,
         dataset_id: str,
-        scoring_functions: Dict[str, Optional[ScoringFnParams]],
+        scoring_functions: dict[str, ScoringFnParams | None],
         save_results_dataset: bool = False,
     ) -> ScoreBatchResponse:
         await self.set_api_key()
@@ -181,9 +181,7 @@ class BraintrustScoringImpl(
             results=res.results,
         )
 
-    async def score_row(
-        self, input_row: Dict[str, Any], scoring_fn_identifier: Optional[str] = None
-    ) -> ScoringResultRow:
+    async def score_row(self, input_row: dict[str, Any], scoring_fn_identifier: str | None = None) -> ScoringResultRow:
         validate_row_schema(input_row, get_valid_schemas(Api.scoring.value))
         await self.set_api_key()
         assert scoring_fn_identifier is not None, "scoring_fn_identifier cannot be None"
@@ -203,8 +201,8 @@ class BraintrustScoringImpl(
 
     async def score(
         self,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: Dict[str, Optional[ScoringFnParams]],
+        input_rows: list[dict[str, Any]],
+        scoring_functions: dict[str, ScoringFnParams | None],
     ) -> ScoreResponse:
         await self.set_api_key()
         res = {}
diff --git a/llama_stack/providers/inline/scoring/braintrust/config.py b/llama_stack/providers/inline/scoring/braintrust/config.py
index d4e0d9bcd..4a80f1e4f 100644
--- a/llama_stack/providers/inline/scoring/braintrust/config.py
+++ b/llama_stack/providers/inline/scoring/braintrust/config.py
@@ -3,19 +3,19 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
 
 class BraintrustScoringConfig(BaseModel):
-    openai_api_key: Optional[str] = Field(
+    openai_api_key: str | None = Field(
         default=None,
         description="The OpenAI API Key",
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "openai_api_key": "${env.OPENAI_API_KEY:}",
         }
diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py b/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py
index 4a83bfe13..88bf10737 100644
--- a/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import Api
 
@@ -12,7 +12,7 @@ from .config import LlmAsJudgeScoringConfig
 
 async def get_provider_impl(
     config: LlmAsJudgeScoringConfig,
-    deps: Dict[Api, Any],
+    deps: dict[Api, Any],
 ):
     from .scoring import LlmAsJudgeScoringImpl
 
diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/config.py b/llama_stack/providers/inline/scoring/llm_as_judge/config.py
index ff63fc5e7..b150ef54c 100644
--- a/llama_stack/providers/inline/scoring/llm_as_judge/config.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/config.py
@@ -3,12 +3,12 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
 
 class LlmAsJudgeScoringConfig(BaseModel):
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {}
diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
index 7f004fbb6..b705cb9b3 100644
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
@@ -50,7 +50,7 @@ class LlmAsJudgeScoringImpl(
 
     async def shutdown(self) -> None: ...
 
-    async def list_scoring_functions(self) -> List[ScoringFn]:
+    async def list_scoring_functions(self) -> list[ScoringFn]:
         scoring_fn_defs_list = self.llm_as_judge_fn.get_supported_scoring_fn_defs()
 
         for f in self.llm_as_judge_fn.get_supported_scoring_fn_defs():
@@ -66,7 +66,7 @@ class LlmAsJudgeScoringImpl(
     async def score_batch(
         self,
         dataset_id: str,
-        scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
+        scoring_functions: dict[str, ScoringFnParams | None] = None,
         save_results_dataset: bool = False,
     ) -> ScoreBatchResponse:
         dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
@@ -91,8 +91,8 @@ class LlmAsJudgeScoringImpl(
 
     async def score(
         self,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
+        input_rows: list[dict[str, Any]],
+        scoring_functions: dict[str, ScoringFnParams | None] = None,
     ) -> ScoreResponse:
         res = {}
         for scoring_fn_id in scoring_functions.keys():
diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
index f4e8ab0aa..51cdf6c3f 100644
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import re
-from typing import Any, Dict, Optional
+from typing import Any
 
 from llama_stack.apis.inference.inference import Inference, UserMessage
 from llama_stack.apis.scoring import ScoringResultRow
@@ -30,9 +30,9 @@ class LlmAsJudgeScoringFn(RegisteredBaseScoringFn):
 
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         assert scoring_fn_identifier is not None, "Scoring function identifier not found."
         fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
diff --git a/llama_stack/providers/inline/telemetry/meta_reference/__init__.py b/llama_stack/providers/inline/telemetry/meta_reference/__init__.py
index 23468c5d0..09e97136a 100644
--- a/llama_stack/providers/inline/telemetry/meta_reference/__init__.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/__init__.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import Api
 
@@ -13,7 +13,7 @@ from .config import TelemetryConfig, TelemetrySink
 __all__ = ["TelemetryConfig", "TelemetrySink"]
 
 
-async def get_provider_impl(config: TelemetryConfig, deps: Dict[Api, Any]):
+async def get_provider_impl(config: TelemetryConfig, deps: dict[Api, Any]):
     from .telemetry import TelemetryAdapter
 
     impl = TelemetryAdapter(config, deps)
diff --git a/llama_stack/providers/inline/telemetry/meta_reference/config.py b/llama_stack/providers/inline/telemetry/meta_reference/config.py
index 57312f41f..af53bfd9c 100644
--- a/llama_stack/providers/inline/telemetry/meta_reference/config.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict, List
+from typing import Any
 
 from pydantic import BaseModel, Field, field_validator
 
@@ -30,10 +30,10 @@ class TelemetryConfig(BaseModel):
     )
     service_name: str = Field(
         # service name is always the same, use zero-width space to avoid clutter
-        default="​",
+        default="",
         description="The service name to use for telemetry",
     )
-    sinks: List[TelemetrySink] = Field(
+    sinks: list[TelemetrySink] = Field(
         default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE],
         description="List of telemetry sinks to enable (possible values: otel, sqlite, console)",
     )
@@ -50,9 +50,9 @@ class TelemetryConfig(BaseModel):
         return v
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> dict[str, Any]:
         return {
-            "service_name": "${env.OTEL_SERVICE_NAME:​}",
+            "service_name": "${env.OTEL_SERVICE_NAME:}",
             "sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
-            "sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
+            "sqlite_db_path": "${env.SQLITE_STORE_DIR:" + __distro_dir__ + "}/" + db_name,
         }
diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py
index b909d32ef..ff1914c15 100644
--- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py
@@ -78,7 +78,7 @@ class ConsoleSpanProcessor(SpanProcessor):
 
             severity = event.attributes.get("severity", "info")
             message = event.attributes.get("message", event.name)
-            if isinstance(message, (dict, list)):
+            if isinstance(message, dict | list):
                 message = json.dumps(message, indent=2)
 
             severity_colors = {
diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
index 9b23c8229..0f6cf8619 100644
--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import threading
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from opentelemetry import metrics, trace
 from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
@@ -16,11 +16,15 @@ from opentelemetry.sdk.resources import Resource
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
 from opentelemetry.semconv.resource import ResourceAttributes
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
 
 from llama_stack.apis.telemetry import (
     Event,
     MetricEvent,
+    MetricLabelMatcher,
+    MetricQueryType,
     QueryCondition,
+    QueryMetricsResponse,
     QuerySpanTreeResponse,
     QueryTracesResponse,
     Span,
@@ -41,6 +45,7 @@ from llama_stack.providers.inline.telemetry.meta_reference.sqlite_span_processor
 )
 from llama_stack.providers.utils.telemetry.dataset_mixin import TelemetryDatasetMixin
 from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore
+from llama_stack.providers.utils.telemetry.tracing import ROOT_SPAN_MARKERS
 
 from .config import TelemetryConfig, TelemetrySink
 
@@ -60,7 +65,7 @@ def is_tracing_enabled(tracer):
 
 
 class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
-    def __init__(self, config: TelemetryConfig, deps: Dict[Api, Any]) -> None:
+    def __init__(self, config: TelemetryConfig, deps: dict[Api, Any]) -> None:
         self.config = config
         self.datasetio_api = deps.get(Api.datasetio)
         self.meter = None
@@ -123,6 +128,17 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
         else:
             raise ValueError(f"Unknown event type: {event}")
 
+    async def query_metrics(
+        self,
+        metric_name: str,
+        start_time: int,
+        end_time: int | None = None,
+        granularity: str | None = "1d",
+        query_type: MetricQueryType = MetricQueryType.RANGE,
+        label_matchers: list[MetricLabelMatcher] | None = None,
+    ) -> QueryMetricsResponse:
+        raise NotImplementedError("Querying metrics is not implemented")
+
     def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None:
         with self._lock:
             # Use global storage instead of instance storage
@@ -132,7 +148,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
             if span:
                 timestamp_ns = int(event.timestamp.timestamp() * 1e9)
                 span.add_event(
-                    name=event.type,
+                    name=event.type.value,
                     attributes={
                         "message": event.message,
                         "severity": event.severity.value,
@@ -192,6 +208,15 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                 event.attributes = {}
             event.attributes["__ttl__"] = ttl_seconds
 
+            # Extract these W3C trace context attributes so they are not written to
+            # underlying storage, as we just need them to propagate the trace context.
+            traceparent = event.attributes.pop("traceparent", None)
+            tracestate = event.attributes.pop("tracestate", None)
+            if traceparent:
+                # If we have a traceparent header value, we're not the root span.
+                for root_attribute in ROOT_SPAN_MARKERS:
+                    event.attributes.pop(root_attribute, None)
+
             if isinstance(event.payload, SpanStartPayload):
                 # Check if span already exists to prevent duplicates
                 if span_id in _GLOBAL_STORAGE["active_spans"]:
@@ -202,8 +227,12 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                     parent_span_id = int(event.payload.parent_span_id, 16)
                     parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id)
                     context = trace.set_span_in_context(parent_span)
-                else:
-                    event.attributes["__root_span__"] = "true"
+                elif traceparent:
+                    carrier = {
+                        "traceparent": traceparent,
+                        "tracestate": tracestate,
+                    }
+                    context = TraceContextTextMapPropagator().extract(carrier=carrier)
 
                 span = tracer.start_span(
                     name=event.payload.name,
@@ -231,10 +260,10 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
 
     async def query_traces(
         self,
-        attribute_filters: Optional[List[QueryCondition]] = None,
-        limit: Optional[int] = 100,
-        offset: Optional[int] = 0,
-        order_by: Optional[List[str]] = None,
+        attribute_filters: list[QueryCondition] | None = None,
+        limit: int | None = 100,
+        offset: int | None = 0,
+        order_by: list[str] | None = None,
     ) -> QueryTracesResponse:
         return QueryTracesResponse(
             data=await self.trace_store.query_traces(
@@ -254,8 +283,8 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
     async def get_span_tree(
         self,
         span_id: str,
-        attributes_to_return: Optional[List[str]] = None,
-        max_depth: Optional[int] = None,
+        attributes_to_return: list[str] | None = None,
+        max_depth: int | None = None,
     ) -> QuerySpanTreeResponse:
         return QuerySpanTreeResponse(
             data=await self.trace_store.get_span_tree(
diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py
deleted file mode 100644
index 8317ce793..000000000
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Dict
-
-from .config import CodeInterpreterToolConfig
-
-__all__ = ["CodeInterpreterToolConfig", "CodeInterpreterToolRuntimeImpl"]
-
-
-async def get_provider_impl(config: CodeInterpreterToolConfig, _deps: Dict[str, Any]):
-    from .code_interpreter import CodeInterpreterToolRuntimeImpl
-
-    impl = CodeInterpreterToolRuntimeImpl(config)
-    await impl.initialize()
-    return impl
diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_env_prefix.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_env_prefix.py
deleted file mode 100644
index 9c5f642ea..000000000
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_env_prefix.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import errno
-
-# Disabling potentially dangerous functions
-import os as _os
-from functools import partial
-
-os_funcs_to_disable = [
-    "kill",
-    "system",
-    "putenv",
-    "remove",
-    "removedirs",
-    "rmdir",
-    "fchdir",
-    "setuid",
-    "fork",
-    "forkpty",
-    "killpg",
-    "rename",
-    "renames",
-    "truncate",
-    "replace",
-    # "unlink",  # Commenting as this was blocking matpltlib from rendering plots correctly
-    "fchmod",
-    "fchown",
-    "chmod",
-    "chown",
-    "chroot",
-    "fchdir",
-    "lchflags",
-    "lchmod",
-    "lchown",
-    "chdir",
-]
-
-
-def call_not_allowed(*args, **kwargs):
-    raise OSError(errno.EPERM, "Call are not permitted in this environment")
-
-
-for func_name in os_funcs_to_disable:
-    if hasattr(_os, func_name):
-        setattr(_os, func_name, partial(call_not_allowed, _func_name=f"os.{func_name}"))
-
-import shutil as _shutil
-
-for func_name in ["rmtree", "move", "chown"]:
-    if hasattr(_shutil, func_name):
-        setattr(
-            _shutil,
-            func_name,
-            partial(call_not_allowed, _func_name=f"shutil.{func_name}"),
-        )
-
-import subprocess as _subprocess
-
-
-def popen_not_allowed(*args, **kwargs):
-    raise _subprocess.CalledProcessError(
-        -1,
-        args[0] if args else "unknown",
-        stderr="subprocess.Popen is not allowed in this environment",
-    )
-
-
-_subprocess.Popen = popen_not_allowed  # type: ignore
-
-
-import atexit as _atexit
-import builtins as _builtins
-import io as _io
-import json as _json
-import sys as _sys
-
-# NB! The following "unused" imports crucial, make sure not not to remove
-# them with linters - they're used in code_execution.py
-from contextlib import (  # noqa
-    contextmanager as _contextmanager,
-)
-from multiprocessing.connection import Connection as _Connection
-
-# Mangle imports to avoid polluting model execution namespace.
-
-_IO_SINK = _io.StringIO()
-_NETWORK_TIMEOUT = 5
-_NETWORK_CONNECTIONS = None
-
-
-def _open_connections():
-    global _NETWORK_CONNECTIONS
-    if _NETWORK_CONNECTIONS is not None:
-        # Ensure connections only opened once.
-        return _NETWORK_CONNECTIONS
-    req_w_fd, resp_r_fd = _sys.argv[1], _sys.argv[2]
-    req_con = _Connection(int(req_w_fd), readable=False)
-    resp_con = _Connection(int(resp_r_fd), writable=False)
-    _NETWORK_CONNECTIONS = (req_con, resp_con)
-    return _NETWORK_CONNECTIONS
-
-
-_builtins._open_connections = _open_connections  # type: ignore
-
-
-@_atexit.register
-def _close_connections():
-    global _NETWORK_CONNECTIONS
-    if _NETWORK_CONNECTIONS is None:
-        return
-    for con in _NETWORK_CONNECTIONS:
-        con.close()
-    del _NETWORK_CONNECTIONS
-
-
-def _network_call(request):
-    # NOTE: We communicate with the parent process in json, encoded
-    # in raw bytes. We do this because native send/recv methods use
-    # pickle which involves execution of arbitrary code.
-    _open_connections()
-    req_con, resp_con = _NETWORK_CONNECTIONS
-
-    req_con.send_bytes(_json.dumps(request).encode("utf-8"))
-    if resp_con.poll(timeout=_NETWORK_TIMEOUT) is None:
-        raise Exception(f"Network request timed out: {_json.dumps(request)}")
-    else:
-        return _json.loads(resp_con.recv_bytes().decode("utf-8"))
diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py
deleted file mode 100644
index 6106cf741..000000000
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import base64
-import json
-import multiprocessing
-import os
-import re
-import subprocess
-import sys
-import tempfile
-import textwrap
-import time
-from dataclasses import dataclass
-from datetime import datetime
-from io import BytesIO
-from pathlib import Path
-from typing import List
-
-from PIL import Image
-
-from .utils import get_code_env_prefix
-
-TOOLS_ATTACHMENT_KEY = "__tools_attachment__"
-TOOLS_ATTACHMENT_KEY_REGEX = re.compile(r"__tools_attachment__=(\{.*?\})")
-
-DIRNAME = Path(__file__).parent
-
-CODE_EXEC_TIMEOUT = 20
-CODE_ENV_PREFIX = get_code_env_prefix()
-
-STDOUTERR_SINK_WRAPPER_TEMPLATE = """\
-with _redirect_stdout(_IO_SINK), _redirect_stderr(_IO_SINK):
-{code}\
-"""
-
-TRYEXCEPT_WRAPPER_TEMPLATE = """\
-try:
-{code}
-except:
-    pass\
-"""
-
-
-def generate_bwrap_command(bind_dirs: List[str]) -> str:
-    """
-    Generate the bwrap command string for binding all
-    directories in the current directory read-only.
-    """
-    bwrap_args = ""
-    bwrap_args += "--ro-bind / / "
-    # Add the --dev flag to mount device files
-    bwrap_args += "--dev /dev "
-    for d in bind_dirs:
-        bwrap_args += f"--bind {d} {d} "
-
-    # Add the --unshare-all flag to isolate the sandbox from the rest of the system
-    bwrap_args += "--unshare-all "
-    # Add the --die-with-parent flag to ensure the child process dies when bwrap's parent dies
-    bwrap_args += "--die-with-parent "
-    return bwrap_args
-
-
-@dataclass
-class CodeExecutionContext:
-    matplotlib_dump_dir: str
-
-
-@dataclass
-class CodeExecutionRequest:
-    scripts: List[str]
-    only_last_cell_stdouterr: bool = True
-    only_last_cell_fail: bool = True
-    seed: int = 0
-    strip_fpaths_in_stderr: bool = True
-    use_bwrap: bool = True
-
-
-class CodeExecutor:
-    def __init__(self, context: CodeExecutionContext):
-        self.context = context
-
-    def execute(self, req: CodeExecutionRequest) -> dict:
-        scripts = req.scripts
-        for i in range(len(scripts) - 1):
-            if req.only_last_cell_stdouterr:
-                scripts[i] = STDOUTERR_SINK_WRAPPER_TEMPLATE.format(code=textwrap.indent(scripts[i], " " * 4))
-            if req.only_last_cell_fail:
-                scripts[i] = TRYEXCEPT_WRAPPER_TEMPLATE.format(code=textwrap.indent(scripts[i], " " * 4))
-
-        # Seeds prefix:
-        seed = req.seed
-        seeds_prefix = f"""\
-def _set_seeds():
-    import random
-    random.seed({seed})
-    import numpy as np
-    np.random.seed({seed})
-_set_seeds()\
-"""
-
-        script = "\n\n".join([seeds_prefix] + [CODE_ENV_PREFIX] + scripts)
-        with tempfile.TemporaryDirectory() as dpath:
-            code_fpath = os.path.join(dpath, "code.py")
-            with open(code_fpath, "w") as f:
-                f.write(script)
-
-            try:
-                python_path = os.environ.get("PYTHONPATH", "")
-                env = dict(
-                    os.environ,
-                    PYTHONHASHSEED=str(seed),
-                    MPLCONFIGDIR=dpath,
-                    MPLBACKEND="module://matplotlib_custom_backend",
-                    PYTHONPATH=f"{DIRNAME}:{python_path}",
-                )
-
-                if req.use_bwrap:
-                    bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath])
-                    cmd = [*bwrap_prefix.split(), sys.executable, "-c", script]
-                else:
-                    cmd = [sys.executable, "-c", script]
-
-                stdout, stderr, returncode = do_subprocess(
-                    cmd=cmd,
-                    env=env,
-                    ctx=self.context,
-                )
-
-                stderr = stderr.strip()
-                if req.strip_fpaths_in_stderr:
-                    pattern = r'File "([^"]+)", line (\d+)'
-                    stderr = re.sub(pattern, r"line \2", stderr)
-
-                return {
-                    "process_status": "completed",
-                    "returncode": returncode,
-                    "stdout": stdout.strip(),
-                    "stderr": stderr,
-                }
-
-            except subprocess.TimeoutExpired:
-                return {
-                    "process_status": "timeout",
-                    "stdout": "Timed out",
-                    "stderr": "Timed out",
-                }
-
-            except Exception as e:
-                return {
-                    "process_status": "error",
-                    "error_type": type(e).__name__,
-                    "stderr": str(e),
-                    "stdout": str(e),
-                }
-
-
-def process_matplotlib_response(response, matplotlib_dump_dir: str):
-    image_data = response["image_data"]
-    # Convert the base64 string to a bytes object
-    images_raw = [base64.b64decode(d["image_base64"]) for d in image_data]
-    # Create a list of PIL images from the bytes objects
-    images = [Image.open(BytesIO(img)) for img in images_raw]
-    # Create a list of image paths
-    image_paths = []
-    for i, img in enumerate(images):
-        # create new directory for each day to better organize data:
-        dump_dname = datetime.today().strftime("%Y-%m-%d")  # noqa: DTZ002 - we don't care about timezones here since we are displaying the date
-        dump_dpath = Path(matplotlib_dump_dir, dump_dname)
-        dump_dpath.mkdir(parents=True, exist_ok=True)
-        # save image into a file
-        dump_fname = f"matplotlib_{str(time.time()).replace('.', '_')}_{i}.png"
-        dump_fpath = dump_dpath / dump_fname
-        img.save(dump_fpath, "PNG")
-        image_paths.append(str(dump_fpath))
-
-    # this is kind of convoluted, we send back this response to the subprocess which
-    # prints it out
-    info = {
-        "filepath": str(image_paths[-1]),
-        "mimetype": "image/png",
-    }
-    return f"{TOOLS_ATTACHMENT_KEY}={json.dumps(info)}"
-
-
-def execute_subprocess_request(request, ctx: CodeExecutionContext):
-    "Route requests from the subprocess (via network Pipes) to the internet/tools."
-    if request["type"] == "matplotlib":
-        return process_matplotlib_response(request, ctx.matplotlib_dump_dir)
-    else:
-        raise Exception(f"Unrecognised network request type: {request['type']}")
-
-
-def do_subprocess(*, cmd: list, env: dict, ctx: CodeExecutionContext):
-    # Create Pipes to be used for any external tool/network requests.
-    req_r, req_w = multiprocessing.Pipe(duplex=False)
-    resp_r, resp_w = multiprocessing.Pipe(duplex=False)
-
-    cmd += [str(req_w.fileno()), str(resp_r.fileno())]
-    proc = subprocess.Popen(
-        cmd,
-        pass_fds=(req_w.fileno(), resp_r.fileno()),
-        text=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        close_fds=True,
-        env=env,
-    )
-
-    # Close unnecessary fds.
-    req_w.close()
-    resp_r.close()
-
-    pipe_close = False
-    done_read = False
-    start = time.monotonic()
-    while proc.poll() is None and not pipe_close:
-        if req_r.poll(0.1):
-            # NB: Python pipe semantics for poll and recv mean that
-            # poll() returns True is a pipe is closed.
-            # CF old school PEP from '09
-            #  https://bugs.python.org/issue5573
-            try:
-                request = json.loads(req_r.recv_bytes().decode("utf-8"))
-                response = execute_subprocess_request(request, ctx)
-
-                resp_w.send_bytes(json.dumps(response).encode("utf-8"))
-            except EOFError:
-                # The request pipe is closed - set a marker to exit
-                # after the next attempt at reading stdout/stderr.
-                pipe_close = True
-
-            try:
-                # If lots has been printed, pipe might be full but
-                # proc cannot exit until all the stdout/stderr
-                # been written/read.
-                stdout, stderr = proc.communicate(timeout=0.3)
-                done_read = True
-            except subprocess.TimeoutExpired:
-                # The program has not terminated. Ignore it, there
-                # may be more network/tool requests.
-                continue
-        if time.monotonic() - start > CODE_EXEC_TIMEOUT:
-            proc.terminate()
-            raise subprocess.TimeoutExpired(cmd, CODE_EXEC_TIMEOUT)
-
-    if not done_read:
-        # Solve race condition where process terminates before
-        # we hit the while loop.
-        stdout, stderr = proc.communicate(timeout=0.3)
-
-    resp_w.close()
-    req_r.close()
-    return stdout, stderr, proc.returncode
diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
deleted file mode 100644
index 10ac2fcc6..000000000
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-import asyncio
-import logging
-import os
-import tempfile
-from typing import Any, Dict, Optional
-
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.tools import (
-    ListToolDefsResponse,
-    Tool,
-    ToolDef,
-    ToolInvocationResult,
-    ToolParameter,
-    ToolRuntime,
-)
-from llama_stack.providers.datatypes import ToolsProtocolPrivate
-
-from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor
-from .config import CodeInterpreterToolConfig
-
-log = logging.getLogger(__name__)
-
-
-class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):
-    def __init__(self, config: CodeInterpreterToolConfig):
-        self.config = config
-        ctx = CodeExecutionContext(
-            matplotlib_dump_dir=tempfile.mkdtemp(),
-        )
-        self.code_executor = CodeExecutor(ctx)
-
-    async def initialize(self):
-        pass
-
-    async def register_tool(self, tool: Tool) -> None:
-        pass
-
-    async def unregister_tool(self, tool_id: str) -> None:
-        return
-
-    async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> ListToolDefsResponse:
-        return ListToolDefsResponse(
-            data=[
-                ToolDef(
-                    name="code_interpreter",
-                    description="Execute code",
-                    parameters=[
-                        ToolParameter(
-                            name="code",
-                            description="The code to execute",
-                            parameter_type="string",
-                        ),
-                    ],
-                )
-            ]
-        )
-
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
-        script = kwargs["code"]
-        # Use environment variable to control bwrap usage
-        force_disable_bwrap = os.environ.get("DISABLE_CODE_SANDBOX", "").lower() in ("1", "true", "yes")
-        req = CodeExecutionRequest(scripts=[script], use_bwrap=not force_disable_bwrap)
-        res = await asyncio.to_thread(self.code_executor.execute, req)
-        pieces = [res["process_status"]]
-        for out_type in ["stdout", "stderr"]:
-            res_out = res[out_type]
-            if res_out != "":
-                pieces.extend([f"[{out_type}]", res_out, f"[/{out_type}]"])
-                if out_type == "stderr":
-                    log.error(f"ipython tool error: ↓\n{res_out}")
-        return ToolInvocationResult(content="\n".join(pieces))
diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py
deleted file mode 100644
index 7de1ec453..000000000
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Dict
-
-from pydantic import BaseModel
-
-
-class CodeInterpreterToolConfig(BaseModel):
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
-        return {}
diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/matplotlib_custom_backend.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/matplotlib_custom_backend.py
deleted file mode 100644
index 6454358a5..000000000
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/matplotlib_custom_backend.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-A custom Matplotlib backend that overrides the show method to return image bytes.
-"""
-
-import base64
-import io
-import json as _json
-import logging
-
-import matplotlib
-from matplotlib.backend_bases import FigureManagerBase
-
-# Import necessary components from Matplotlib
-from matplotlib.backends.backend_agg import FigureCanvasAgg
-
-log = logging.getLogger(__name__)
-
-
-class CustomFigureCanvas(FigureCanvasAgg):
-    def show(self):
-        # Save the figure to a BytesIO object
-        buf = io.BytesIO()
-        self.print_png(buf)
-        image_bytes = buf.getvalue()
-        buf.close()
-        return image_bytes
-
-
-class CustomFigureManager(FigureManagerBase):
-    def __init__(self, canvas, num):
-        super().__init__(canvas, num)
-
-
-# Mimic module initialization that integrates with the Matplotlib backend system
-def _create_figure_manager(num, *args, **kwargs):
-    """
-    Create a custom figure manager instance.
-    """
-    FigureClass = kwargs.pop("FigureClass", None)  # noqa: N806
-    if FigureClass is None:
-        from matplotlib.figure import Figure
-
-        FigureClass = Figure  # noqa: N806
-    fig = FigureClass(*args, **kwargs)
-    canvas = CustomFigureCanvas(fig)
-    manager = CustomFigureManager(canvas, num)
-    return manager
-
-
-def show():
-    """
-    Handle all figures and potentially return their images as bytes.
-
-    This function iterates over all figures registered with the custom backend,
-    renders them as images in bytes format, and could return a list of bytes objects,
-    one for each figure, or handle them as needed.
-    """
-    image_data = []
-    for manager in matplotlib._pylab_helpers.Gcf.get_all_fig_managers():
-        # Get the figure from the manager
-        fig = manager.canvas.figure
-        buf = io.BytesIO()  # Create a buffer for the figure
-        fig.savefig(buf, format="png")  # Save the figure to the buffer in PNG format
-        buf.seek(0)  # Go to the beginning of the buffer
-        image_bytes = buf.getvalue()  # Retrieve bytes value
-        image_base64 = base64.b64encode(image_bytes).decode("utf-8")
-        image_data.append({"image_base64": image_base64})
-        buf.close()
-
-    # The _open_connections method is dynamically made available to
-    # the interpreter by bundling code from "code_env_prefix.py" -- by literally prefixing it -- and
-    # then "eval"ing it within a sandboxed interpreter.
-    req_con, resp_con = _open_connections()  # noqa: F821
-
-    _json_dump = _json.dumps(
-        {
-            "type": "matplotlib",
-            "image_data": image_data,
-        }
-    )
-    req_con.send_bytes(_json_dump.encode("utf-8"))
-    resp = _json.loads(resp_con.recv_bytes().decode("utf-8"))
-    log.info(resp)
-
-
-FigureCanvas = CustomFigureCanvas
-FigureManager = CustomFigureManager
diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/utils.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/utils.py
deleted file mode 100644
index d6f539a39..000000000
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/utils.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-
-DIR = os.path.dirname(os.path.realpath(__file__))
-CODE_ENV_PREFIX_FILE = os.path.join(DIR, "code_env_prefix.py")
-CODE_ENV_PREFIX = None
-
-
-def get_code_env_prefix() -> str:
-    global CODE_ENV_PREFIX
-
-    if CODE_ENV_PREFIX is None:
-        with open(CODE_ENV_PREFIX_FILE, "r") as f:
-            CODE_ENV_PREFIX = f.read()
-
-    return CODE_ENV_PREFIX
diff --git a/llama_stack/providers/inline/tool_runtime/rag/__init__.py b/llama_stack/providers/inline/tool_runtime/rag/__init__.py
index 0ef3c35e9..f9a6e5c55 100644
--- a/llama_stack/providers/inline/tool_runtime/rag/__init__.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/__init__.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.providers.datatypes import Api
 
 from .config import RagToolRuntimeConfig
 
 
-async def get_provider_impl(config: RagToolRuntimeConfig, deps: Dict[Api, Any]):
+async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
     from .memory import MemoryToolRuntimeImpl
 
     impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference])
diff --git a/llama_stack/providers/inline/tool_runtime/rag/config.py b/llama_stack/providers/inline/tool_runtime/rag/config.py
index c75c3fc51..43ba78e65 100644
--- a/llama_stack/providers/inline/tool_runtime/rag/config.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/config.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
 
 class RagToolRuntimeConfig(BaseModel):
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {}
diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py
index 8d4689e5d..4776d47d0 100644
--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@@ -8,7 +8,7 @@ import asyncio
 import logging
 import secrets
 import string
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from pydantic import TypeAdapter
 
@@ -25,14 +25,14 @@ from llama_stack.apis.tools import (
     RAGQueryConfig,
     RAGQueryResult,
     RAGToolRuntime,
-    Tool,
     ToolDef,
+    ToolGroup,
     ToolInvocationResult,
     ToolParameter,
     ToolRuntime,
 )
 from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO
-from llama_stack.providers.datatypes import ToolsProtocolPrivate
+from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
 from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
 from llama_stack.providers.utils.memory.vector_store import (
     content_from_doc,
@@ -49,7 +49,7 @@ def make_random_string(length: int = 8):
     return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length))
 
 
-class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
+class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
     def __init__(
         self,
         config: RagToolRuntimeConfig,
@@ -66,15 +66,15 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
     async def shutdown(self):
         pass
 
-    async def register_tool(self, tool: Tool) -> None:
+    async def register_toolgroup(self, toolgroup: ToolGroup) -> None:
         pass
 
-    async def unregister_tool(self, tool_id: str) -> None:
+    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
         return
 
     async def insert(
         self,
-        documents: List[RAGDocument],
+        documents: list[RAGDocument],
         vector_db_id: str,
         chunk_size_in_tokens: int = 512,
     ) -> None:
@@ -87,6 +87,7 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
                     content,
                     chunk_size_in_tokens,
                     chunk_size_in_tokens // 4,
+                    doc.metadata,
                 )
             )
 
@@ -101,11 +102,13 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
     async def query(
         self,
         content: InterleavedContent,
-        vector_db_ids: List[str],
-        query_config: Optional[RAGQueryConfig] = None,
+        vector_db_ids: list[str],
+        query_config: RAGQueryConfig | None = None,
     ) -> RAGQueryResult:
         if not vector_db_ids:
-            return RAGQueryResult(content=None)
+            raise ValueError(
+                "No vector DBs were provided to the knowledge search tool. Please provide at least one vector DB ID."
+            )
 
         query_config = query_config or RAGQueryConfig()
         query = await generate_rag_query(
@@ -119,11 +122,12 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
                 query=query,
                 params={
                     "max_chunks": query_config.max_chunks,
+                    "mode": query_config.mode,
                 },
             )
             for vector_db_id in vector_db_ids
         ]
-        results: List[QueryChunksResponse] = await asyncio.gather(*tasks)
+        results: list[QueryChunksResponse] = await asyncio.gather(*tasks)
         chunks = [c for r in results for c in r.chunks]
         scores = [s for r in results for s in r.scores]
 
@@ -140,19 +144,21 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
                 text=f"knowledge_search tool found {len(chunks)} chunks:\nBEGIN of knowledge_search tool results.\n"
             )
         ]
-        for i, c in enumerate(chunks):
-            metadata = c.metadata
-            tokens += metadata["token_count"]
+        for i, chunk in enumerate(chunks):
+            metadata = chunk.metadata
+            tokens += metadata.get("token_count", 0)
+            tokens += metadata.get("metadata_token_count", 0)
+
             if tokens > query_config.max_tokens_in_context:
                 log.error(
                     f"Using {len(picked)} chunks; reached max tokens in context: {tokens}",
                 )
                 break
-            picked.append(
-                TextContentItem(
-                    text=f"Result {i + 1}:\nDocument_id:{metadata['document_id'][:5]}\nContent: {c.content}\n",
-                )
-            )
+
+            metadata_subset = {k: v for k, v in metadata.items() if k not in ["token_count", "metadata_token_count"]}
+            text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_subset)
+            picked.append(TextContentItem(text=text_content))
+
         picked.append(TextContentItem(text="END of knowledge_search tool results.\n"))
         picked.append(
             TextContentItem(
@@ -168,7 +174,7 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
         )
 
     async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
     ) -> ListToolDefsResponse:
         # Parameters are not listed since these methods are not yet invoked automatically
         # by the LLM. The method is only implemented so things like /tools can list without
@@ -193,7 +199,7 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
             ]
         )
 
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
         vector_db_ids = kwargs.get("vector_db_ids", [])
         query_config = kwargs.get("query_config")
         if query_config:
diff --git a/llama_stack/providers/inline/vector_io/chroma/__init__.py b/llama_stack/providers/inline/vector_io/chroma/__init__.py
index f39188b46..2e0efb8a1 100644
--- a/llama_stack/providers/inline/vector_io/chroma/__init__.py
+++ b/llama_stack/providers/inline/vector_io/chroma/__init__.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.providers.datatypes import Api
 
 from .config import ChromaVectorIOConfig
 
 
-async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, Any]):
+async def get_provider_impl(config: ChromaVectorIOConfig, deps: dict[Api, Any]):
     from llama_stack.providers.remote.vector_io.chroma.chroma import (
         ChromaVectorIOAdapter,
     )
diff --git a/llama_stack/providers/inline/vector_io/chroma/config.py b/llama_stack/providers/inline/vector_io/chroma/config.py
index 1e333fe92..81e2f289e 100644
--- a/llama_stack/providers/inline/vector_io/chroma/config.py
+++ b/llama_stack/providers/inline/vector_io/chroma/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -13,5 +13,5 @@ class ChromaVectorIOConfig(BaseModel):
     db_path: str
 
     @classmethod
-    def sample_run_config(cls, db_path: str = "${env.CHROMADB_PATH}", **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, db_path: str = "${env.CHROMADB_PATH}", **kwargs: Any) -> dict[str, Any]:
         return {"db_path": db_path}
diff --git a/llama_stack/providers/inline/vector_io/faiss/__init__.py b/llama_stack/providers/inline/vector_io/faiss/__init__.py
index fc8ce70b4..68a1dee66 100644
--- a/llama_stack/providers/inline/vector_io/faiss/__init__.py
+++ b/llama_stack/providers/inline/vector_io/faiss/__init__.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.providers.datatypes import Api
 
 from .config import FaissVectorIOConfig
 
 
-async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, Any]):
+async def get_provider_impl(config: FaissVectorIOConfig, deps: dict[Api, Any]):
     from .faiss import FaissVectorIOAdapter
 
     assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}"
diff --git a/llama_stack/providers/inline/vector_io/faiss/config.py b/llama_stack/providers/inline/vector_io/faiss/config.py
index fa6e5bede..cbcbb1762 100644
--- a/llama_stack/providers/inline/vector_io/faiss/config.py
+++ b/llama_stack/providers/inline/vector_io/faiss/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -20,7 +20,7 @@ class FaissVectorIOConfig(BaseModel):
     kvstore: KVStoreConfig
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "kvstore": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py
index 20c795650..47256d88d 100644
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -9,7 +9,7 @@ import base64
 import io
 import json
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import faiss
 import numpy as np
@@ -84,7 +84,7 @@ class FaissIndex(EmbeddingIndex):
 
         await self.kvstore.delete(f"{FAISS_INDEX_PREFIX}{self.bank_id}")
 
-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
         # Add dimension check
         embedding_dim = embeddings.shape[1] if len(embeddings.shape) > 1 else embeddings.shape[0]
         if embedding_dim != self.index.d:
@@ -99,9 +99,13 @@ class FaissIndex(EmbeddingIndex):
         # Save updated index
         await self._save_index()
 
-    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+    async def query_vector(
+        self,
+        embedding: NDArray,
+        k: int,
+        score_threshold: float,
+    ) -> QueryChunksResponse:
         distances, indices = await asyncio.to_thread(self.index.search, embedding.reshape(1, -1).astype(np.float32), k)
-
         chunks = []
         scores = []
         for d, i in zip(distances[0], indices[0], strict=False):
@@ -112,6 +116,14 @@ class FaissIndex(EmbeddingIndex):
 
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
+    async def query_keyword(
+        self,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError("Keyword search is not supported in FAISS")
+
 
 class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     def __init__(self, config: FaissVectorIOConfig, inference_api: Inference) -> None:
@@ -125,7 +137,7 @@ class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         # Load existing banks from kvstore
         start_key = VECTOR_DBS_PREFIX
         end_key = f"{VECTOR_DBS_PREFIX}\xff"
-        stored_vector_dbs = await self.kvstore.range(start_key, end_key)
+        stored_vector_dbs = await self.kvstore.values_in_range(start_key, end_key)
 
         for vector_db_data in stored_vector_dbs:
             vector_db = VectorDB.model_validate_json(vector_db_data)
@@ -159,7 +171,7 @@ class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
             inference_api=self.inference_api,
         )
 
-    async def list_vector_dbs(self) -> List[VectorDB]:
+    async def list_vector_dbs(self) -> list[VectorDB]:
         return [i.vector_db for i in self.cache.values()]
 
     async def unregister_vector_db(self, vector_db_id: str) -> None:
@@ -176,8 +188,8 @@ class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     async def insert_chunks(
         self,
         vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
     ) -> None:
         index = self.cache.get(vector_db_id)
         if index is None:
@@ -189,7 +201,7 @@ class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         self,
         vector_db_id: str,
         query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
         index = self.cache.get(vector_db_id)
         if index is None:
diff --git a/llama_stack/providers/inline/vector_io/milvus/__init__.py b/llama_stack/providers/inline/vector_io/milvus/__init__.py
index d88a3b005..fe3a1f7f9 100644
--- a/llama_stack/providers/inline/vector_io/milvus/__init__.py
+++ b/llama_stack/providers/inline/vector_io/milvus/__init__.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.providers.datatypes import Api
 
 from .config import MilvusVectorIOConfig
 
 
-async def get_provider_impl(config: MilvusVectorIOConfig, deps: Dict[Api, Any]):
+async def get_provider_impl(config: MilvusVectorIOConfig, deps: dict[Api, Any]):
     from llama_stack.providers.remote.vector_io.milvus.milvus import MilvusVectorIOAdapter
 
     impl = MilvusVectorIOAdapter(config, deps[Api.inference])
diff --git a/llama_stack/providers/inline/vector_io/milvus/config.py b/llama_stack/providers/inline/vector_io/milvus/config.py
index 0e11d8c7c..eb22b5276 100644
--- a/llama_stack/providers/inline/vector_io/milvus/config.py
+++ b/llama_stack/providers/inline/vector_io/milvus/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -16,5 +16,5 @@ class MilvusVectorIOConfig(BaseModel):
     db_path: str
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {"db_path": "${env.MILVUS_DB_PATH}"}
diff --git a/llama_stack/providers/inline/vector_io/qdrant/__init__.py b/llama_stack/providers/inline/vector_io/qdrant/__init__.py
index 8f0b91c61..ee33b3797 100644
--- a/llama_stack/providers/inline/vector_io/qdrant/__init__.py
+++ b/llama_stack/providers/inline/vector_io/qdrant/__init__.py
@@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec
 
 from .config import QdrantVectorIOConfig
 
 
-async def get_adapter_impl(config: QdrantVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: QdrantVectorIOConfig, deps: dict[Api, ProviderSpec]):
     from llama_stack.providers.remote.vector_io.qdrant.qdrant import QdrantVectorIOAdapter
 
     impl = QdrantVectorIOAdapter(config, deps[Api.inference])
diff --git a/llama_stack/providers/inline/vector_io/qdrant/config.py b/llama_stack/providers/inline/vector_io/qdrant/config.py
index 282e951b0..283724b41 100644
--- a/llama_stack/providers/inline/vector_io/qdrant/config.py
+++ b/llama_stack/providers/inline/vector_io/qdrant/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -17,7 +17,7 @@ class QdrantVectorIOConfig(BaseModel):
     path: str
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
         return {
             "path": "${env.QDRANT_PATH:~/.llama/" + __distro_dir__ + "}/" + "qdrant.db",
         }
diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py
index 2380eb0ef..6db176eda 100644
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.providers.datatypes import Api
 
 from .config import SQLiteVectorIOConfig
 
 
-async def get_provider_impl(config: SQLiteVectorIOConfig, deps: Dict[Api, Any]):
+async def get_provider_impl(config: SQLiteVectorIOConfig, deps: dict[Api, Any]):
     from .sqlite_vec import SQLiteVecVectorIOAdapter
 
     assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}"
diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/config.py b/llama_stack/providers/inline/vector_io/sqlite_vec/config.py
index 906c19689..cb806cb39 100644
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/config.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -13,7 +13,7 @@ class SQLiteVectorIOConfig(BaseModel):
     db_path: str
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
         return {
             "db_path": "${env.SQLITE_STORE_DIR:" + __distro_dir__ + "}/" + "sqlite_vec.db",
         }
diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
index 5f7671138..fc1a8ddb0 100644
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@@ -10,7 +10,7 @@ import logging
 import sqlite3
 import struct
 import uuid
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import numpy as np
 import sqlite_vec
@@ -24,8 +24,13 @@ from llama_stack.providers.utils.memory.vector_store import EmbeddingIndex, Vect
 
 logger = logging.getLogger(__name__)
 
+# Supported search modes; whether a given mode is available depends on the VectorIO provider.
+VECTOR_SEARCH = "vector"
+KEYWORD_SEARCH = "keyword"
+SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH}
 
-def serialize_vector(vector: List[float]) -> bytes:
+
+def serialize_vector(vector: list[float]) -> bytes:
     """Serialize a list of floats into a compact binary representation."""
     return struct.pack(f"{len(vector)}f", *vector)
 
@@ -45,6 +50,7 @@ class SQLiteVecIndex(EmbeddingIndex):
     Two tables are used:
       - A metadata table (chunks_{bank_id}) that holds the chunk JSON.
       - A virtual table (vec_chunks_{bank_id}) that holds the serialized vector.
+      - An additional FTS5 table (fts_chunks_{bank_id}) for full-text keyword search.
     """
 
     def __init__(self, dimension: int, db_path: str, bank_id: str):
@@ -53,6 +59,7 @@ class SQLiteVecIndex(EmbeddingIndex):
         self.bank_id = bank_id
         self.metadata_table = f"chunks_{bank_id}".replace("-", "_")
         self.vector_table = f"vec_chunks_{bank_id}".replace("-", "_")
+        self.fts_table = f"fts_chunks_{bank_id}".replace("-", "_")
 
     @classmethod
     async def create(cls, dimension: int, db_path: str, bank_id: str):
@@ -78,6 +85,14 @@ class SQLiteVecIndex(EmbeddingIndex):
                     USING vec0(embedding FLOAT[{self.dimension}], id TEXT);
                 """)
                 connection.commit()
+                # FTS5 table (for keyword search). Both the vector and FTS5 tables are created by default and the
+                # relevant one is used depending on the query. Once the client side supports passing a search_mode
+                # option during initialization, it will be easier to create only the table that is required.
+                cur.execute(f"""
+                            CREATE VIRTUAL TABLE IF NOT EXISTS {self.fts_table}
+                            USING fts5(id, content);
+                        """)
+                connection.commit()
             finally:
                 cur.close()
                 connection.close()
@@ -91,6 +106,7 @@ class SQLiteVecIndex(EmbeddingIndex):
             try:
                 cur.execute(f"DROP TABLE IF EXISTS {self.metadata_table};")
                 cur.execute(f"DROP TABLE IF EXISTS {self.vector_table};")
+                cur.execute(f"DROP TABLE IF EXISTS {self.fts_table};")
                 connection.commit()
             finally:
                 cur.close()
@@ -98,12 +114,13 @@ class SQLiteVecIndex(EmbeddingIndex):
 
         await asyncio.to_thread(_drop_tables)
 
-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray, batch_size: int = 500):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray, batch_size: int = 500):
         """
         Add new chunks along with their embeddings using batch inserts.
         For each chunk, we insert its JSON into the metadata table and then insert its
         embedding (serialized to raw bytes) into the virtual table using the assigned rowid.
         If any insert fails, the transaction is rolled back to maintain consistency.
+        Also inserts chunk content into FTS table for keyword search support.
         """
         assert all(isinstance(chunk.content, str) for chunk in chunks), "SQLiteVecIndex only supports text chunks"
 
@@ -112,18 +129,16 @@ class SQLiteVecIndex(EmbeddingIndex):
             cur = connection.cursor()
 
             try:
-                # Start transaction a single transcation for all batches
                 cur.execute("BEGIN TRANSACTION")
                 for i in range(0, len(chunks), batch_size):
                     batch_chunks = chunks[i : i + batch_size]
                     batch_embeddings = embeddings[i : i + batch_size]
-                    # Prepare metadata inserts
+
+                    # Insert metadata
                     metadata_data = [
                         (generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.model_dump_json())
                         for chunk in batch_chunks
-                        if isinstance(chunk.content, str)
                     ]
-                    # Insert metadata (ON CONFLICT to avoid duplicates)
                     cur.executemany(
                         f"""
                         INSERT INTO {self.metadata_table} (id, chunk)
@@ -132,21 +147,43 @@ class SQLiteVecIndex(EmbeddingIndex):
                         """,
                         metadata_data,
                     )
-                    # Prepare embeddings inserts
+
+                    # Insert vector embeddings
                     embedding_data = [
                         (
-                            generate_chunk_id(chunk.metadata["document_id"], chunk.content),
-                            serialize_vector(emb.tolist()),
+                            (
+                                generate_chunk_id(chunk.metadata["document_id"], chunk.content),
+                                serialize_vector(emb.tolist()),
+                            )
                         )
                         for chunk, emb in zip(batch_chunks, batch_embeddings, strict=True)
-                        if isinstance(chunk.content, str)
                     ]
-                    # Insert embeddings in batch
-                    cur.executemany(f"INSERT INTO {self.vector_table} (id, embedding) VALUES (?, ?);", embedding_data)
+                    cur.executemany(
+                        f"INSERT INTO {self.vector_table} (id, embedding) VALUES (?, ?);",
+                        embedding_data,
+                    )
+
+                    # Insert FTS content
+                    fts_data = [
+                        (generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.content)
+                        for chunk in batch_chunks
+                    ]
+                    # DELETE existing entries with same IDs (FTS5 doesn't support ON CONFLICT)
+                    cur.executemany(
+                        f"DELETE FROM {self.fts_table} WHERE id = ?;",
+                        [(row[0],) for row in fts_data],
+                    )
+
+                    # INSERT new entries
+                    cur.executemany(
+                        f"INSERT INTO {self.fts_table} (id, content) VALUES (?, ?);",
+                        fts_data,
+                    )
+
                 connection.commit()
 
             except sqlite3.Error as e:
-                connection.rollback()  # Rollback on failure
+                connection.rollback()
                 logger.error(f"Error inserting into {self.vector_table}: {e}")
                 raise
 
@@ -154,22 +191,25 @@ class SQLiteVecIndex(EmbeddingIndex):
                 cur.close()
                 connection.close()
 
-        # Process all batches in a single thread
+        # Run batch insertion in a background thread
         await asyncio.to_thread(_execute_all_batch_inserts)
 
-    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+    async def query_vector(
+        self,
+        embedding: NDArray,
+        k: int,
+        score_threshold: float,
+    ) -> QueryChunksResponse:
         """
-        Query for the k most similar chunks. We convert the query embedding to a blob and run a SQL query
-        against the virtual table. The SQL joins the metadata table to recover the chunk JSON.
+        Performs vector-based search using a virtual table for vector similarity.
         """
-        emb_list = embedding.tolist() if isinstance(embedding, np.ndarray) else list(embedding)
-        emb_blob = serialize_vector(emb_list)
 
         def _execute_query():
             connection = _create_sqlite_connection(self.db_path)
             cur = connection.cursor()
-
             try:
+                emb_list = embedding.tolist() if isinstance(embedding, np.ndarray) else list(embedding)
+                emb_blob = serialize_vector(emb_list)
                 query_sql = f"""
                     SELECT m.id, m.chunk, v.distance
                     FROM {self.vector_table} AS v
@@ -184,17 +224,66 @@ class SQLiteVecIndex(EmbeddingIndex):
                 connection.close()
 
         rows = await asyncio.to_thread(_execute_query)
-
         chunks, scores = [], []
-        for _id, chunk_json, distance in rows:
+        for row in rows:
+            _id, chunk_json, distance = row
+            score = 1.0 / distance if distance != 0 else float("inf")
+            if score < score_threshold:
+                continue
+            try:
+                chunk = Chunk.model_validate_json(chunk_json)
+            except Exception as e:
+                logger.error(f"Error parsing chunk JSON for id {_id}: {e}")
+                continue
+            chunks.append(chunk)
+            scores.append(score)
+        return QueryChunksResponse(chunks=chunks, scores=scores)
+
+    async def query_keyword(
+        self,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+    ) -> QueryChunksResponse:
+        """
+        Performs keyword-based search using SQLite FTS5 for relevance-ranked full-text search.
+        """
+        if query_string is None:
+            raise ValueError("query_string is required for keyword search.")
+
+        def _execute_query():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()
+            try:
+                query_sql = f"""
+                    SELECT DISTINCT m.id, m.chunk, bm25({self.fts_table}) AS score
+                    FROM {self.fts_table} AS f
+                    JOIN {self.metadata_table} AS m ON m.id = f.id
+                    WHERE f.content MATCH ?
+                    ORDER BY score ASC
+                    LIMIT ?;
+                """
+                cur.execute(query_sql, (query_string, k))
+                return cur.fetchall()
+            finally:
+                cur.close()
+                connection.close()
+
+        rows = await asyncio.to_thread(_execute_query)
+        chunks, scores = [], []
+        for row in rows:
+            _id, chunk_json, score = row
+            # BM25 scores returned by SQLite FTS5's bm25() are NEGATED (i.e., more relevant = more negative).
+            # This design is intentional to simplify sorting by ascending score.
+            # Reference: https://alexgarcia.xyz/blog/2024/sqlite-vec-hybrid-search/index.html
+            if score > -score_threshold:
+                continue
             try:
                 chunk = Chunk.model_validate_json(chunk_json)
             except Exception as e:
                 logger.error(f"Error parsing chunk JSON for id {_id}: {e}")
                 continue
             chunks.append(chunk)
-            # Mimic the Faiss scoring: score = 1/distance (avoid division by zero)
-            score = 1.0 / distance if distance != 0 else float("inf")
             scores.append(score)
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
@@ -209,7 +298,7 @@ class SQLiteVecVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     def __init__(self, config, inference_api: Inference) -> None:
         self.config = config
         self.inference_api = inference_api
-        self.cache: Dict[str, VectorDBWithIndex] = {}
+        self.cache: dict[str, VectorDBWithIndex] = {}
 
     async def initialize(self) -> None:
         def _setup_connection():
@@ -264,7 +353,7 @@ class SQLiteVecVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.config.db_path, vector_db.identifier)
         self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
 
-    async def list_vector_dbs(self) -> List[VectorDB]:
+    async def list_vector_dbs(self) -> list[VectorDB]:
         return [v.vector_db for v in self.cache.values()]
 
     async def unregister_vector_db(self, vector_db_id: str) -> None:
@@ -286,7 +375,7 @@ class SQLiteVecVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
 
         await asyncio.to_thread(_delete_vector_db_from_registry)
 
-    async def insert_chunks(self, vector_db_id: str, chunks: List[Chunk], ttl_seconds: Optional[int] = None) -> None:
+    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
         if vector_db_id not in self.cache:
             raise ValueError(f"Vector DB {vector_db_id} not found. Found: {list(self.cache.keys())}")
         # The VectorDBWithIndex helper is expected to compute embeddings via the inference_api
@@ -294,7 +383,7 @@ class SQLiteVecVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         await self.cache[vector_db_id].insert_chunks(chunks)
 
     async def query_chunks(
-        self, vector_db_id: str, query: Any, params: Optional[Dict[str, Any]] = None
+        self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
         if vector_db_id not in self.cache:
             raise ValueError(f"Vector DB {vector_db_id} not found")
@@ -303,5 +392,5 @@ class SQLiteVecVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
 
 def generate_chunk_id(document_id: str, chunk_text: str) -> str:
     """Generate a unique chunk ID using a hash of document ID and chunk text."""
-    hash_input = f"{document_id}:{chunk_text}".encode("utf-8")
+    hash_input = f"{document_id}:{chunk_text}".encode()
     return str(uuid.UUID(hashlib.md5(hash_input).hexdigest()))
diff --git a/llama_stack/providers/registry/agents.py b/llama_stack/providers/registry/agents.py
index 3ed59304d..e0801a8d1 100644
--- a/llama_stack/providers/registry/agents.py
+++ b/llama_stack/providers/registry/agents.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import (
     Api,
@@ -14,7 +13,7 @@ from llama_stack.providers.datatypes import (
 from llama_stack.providers.utils.kvstore import kvstore_dependencies
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.agents,
diff --git a/llama_stack/providers/registry/datasetio.py b/llama_stack/providers/registry/datasetio.py
index f83dcbc60..152cc9cb9 100644
--- a/llama_stack/providers/registry/datasetio.py
+++ b/llama_stack/providers/registry/datasetio.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import (
     AdapterSpec,
@@ -15,7 +14,7 @@ from llama_stack.providers.datatypes import (
 )
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.datasetio,
@@ -36,4 +35,15 @@ def available_providers() -> List[ProviderSpec]:
                 config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig",
             ),
         ),
+        remote_provider_spec(
+            api=Api.datasetio,
+            adapter=AdapterSpec(
+                adapter_type="nvidia",
+                pip_packages=[
+                    "datasets",
+                ],
+                module="llama_stack.providers.remote.datasetio.nvidia",
+                config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig",
+            ),
+        ),
     ]
diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py
index 9604d5da4..c9c29bbe0 100644
--- a/llama_stack/providers/registry/eval.py
+++ b/llama_stack/providers/registry/eval.py
@@ -4,12 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.eval,
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index 4040f0d80..7b49ef09b 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import (
     AdapterSpec,
@@ -29,7 +28,7 @@ META_REFERENCE_DEPS = [
 ]
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.inference,
@@ -227,6 +226,16 @@ def available_providers() -> List[ProviderSpec]:
                 provider_data_validator="llama_stack.providers.remote.inference.fireworks_openai_compat.config.FireworksProviderDataValidator",
             ),
         ),
+        remote_provider_spec(
+            api=Api.inference,
+            adapter=AdapterSpec(
+                adapter_type="llama-openai-compat",
+                pip_packages=["litellm"],
+                module="llama_stack.providers.remote.inference.llama_openai_compat",
+                config_class="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaCompatConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaProviderDataValidator",
+            ),
+        ),
         remote_provider_spec(
             api=Api.inference,
             adapter=AdapterSpec(
@@ -271,11 +280,10 @@ def available_providers() -> List[ProviderSpec]:
             api=Api.inference,
             adapter=AdapterSpec(
                 adapter_type="sambanova",
-                pip_packages=[
-                    "openai",
-                ],
+                pip_packages=["litellm"],
                 module="llama_stack.providers.remote.inference.sambanova",
                 config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
             ),
         ),
         remote_provider_spec(
diff --git a/llama_stack/providers/registry/post_training.py b/llama_stack/providers/registry/post_training.py
index 4d10fcf3b..d752b8819 100644
--- a/llama_stack/providers/registry/post_training.py
+++ b/llama_stack/providers/registry/post_training.py
@@ -4,12 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.post_training,
@@ -22,6 +21,17 @@ def available_providers() -> List[ProviderSpec]:
                 Api.datasets,
             ],
         ),
+        InlineProviderSpec(
+            api=Api.post_training,
+            provider_type="inline::huggingface",
+            pip_packages=["torch", "trl", "transformers", "peft", "datasets"],
+            module="llama_stack.providers.inline.post_training.huggingface",
+            config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig",
+            api_dependencies=[
+                Api.datasetio,
+                Api.datasets,
+            ],
+        ),
         remote_provider_spec(
             api=Api.post_training,
             adapter=AdapterSpec(
diff --git a/llama_stack/providers/registry/safety.py b/llama_stack/providers/registry/safety.py
index 54dc51034..e0a04be48 100644
--- a/llama_stack/providers/registry/safety.py
+++ b/llama_stack/providers/registry/safety.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import (
     AdapterSpec,
@@ -15,7 +14,7 @@ from llama_stack.providers.datatypes import (
 )
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.safety,
@@ -64,4 +63,14 @@ def available_providers() -> List[ProviderSpec]:
                 config_class="llama_stack.providers.remote.safety.nvidia.NVIDIASafetyConfig",
             ),
         ),
+        remote_provider_spec(
+            api=Api.safety,
+            adapter=AdapterSpec(
+                adapter_type="sambanova",
+                pip_packages=["litellm"],
+                module="llama_stack.providers.remote.safety.sambanova",
+                config_class="llama_stack.providers.remote.safety.sambanova.SambaNovaSafetyConfig",
+                provider_data_validator="llama_stack.providers.remote.safety.sambanova.config.SambaNovaProviderDataValidator",
+            ),
+        ),
     ]
diff --git a/llama_stack/providers/registry/scoring.py b/llama_stack/providers/registry/scoring.py
index ca09be984..7980d6a13 100644
--- a/llama_stack/providers/registry/scoring.py
+++ b/llama_stack/providers/registry/scoring.py
@@ -4,12 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.scoring,
diff --git a/llama_stack/providers/registry/telemetry.py b/llama_stack/providers/registry/telemetry.py
index fc249f3e2..14da06126 100644
--- a/llama_stack/providers/registry/telemetry.py
+++ b/llama_stack/providers/registry/telemetry.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import (
     Api,
@@ -13,7 +12,7 @@ from llama_stack.providers.datatypes import (
 )
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.telemetry,
diff --git a/llama_stack/providers/registry/tool_runtime.py b/llama_stack/providers/registry/tool_runtime.py
index 95ea2dcf9..277914df2 100644
--- a/llama_stack/providers/registry/tool_runtime.py
+++ b/llama_stack/providers/registry/tool_runtime.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import (
     AdapterSpec,
@@ -15,7 +14,7 @@ from llama_stack.providers.datatypes import (
 )
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.tool_runtime,
@@ -36,13 +35,6 @@ def available_providers() -> List[ProviderSpec]:
             config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
             api_dependencies=[Api.vector_io, Api.inference],
         ),
-        InlineProviderSpec(
-            api=Api.tool_runtime,
-            provider_type="inline::code-interpreter",
-            pip_packages=[],
-            module="llama_stack.providers.inline.tool_runtime.code_interpreter",
-            config_class="llama_stack.providers.inline.tool_runtime.code_interpreter.config.CodeInterpreterToolConfig",
-        ),
         remote_provider_spec(
             api=Api.tool_runtime,
             adapter=AdapterSpec(
@@ -88,8 +80,9 @@ def available_providers() -> List[ProviderSpec]:
             adapter=AdapterSpec(
                 adapter_type="model-context-protocol",
                 module="llama_stack.providers.remote.tool_runtime.model_context_protocol",
-                config_class="llama_stack.providers.remote.tool_runtime.model_context_protocol.config.ModelContextProtocolConfig",
+                config_class="llama_stack.providers.remote.tool_runtime.model_context_protocol.config.MCPProviderConfig",
                 pip_packages=["mcp"],
+                provider_data_validator="llama_stack.providers.remote.tool_runtime.model_context_protocol.config.MCPProviderDataValidator",
             ),
         ),
     ]
diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py
index 93031763d..d888c8420 100644
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.providers.datatypes import (
     AdapterSpec,
@@ -15,7 +14,7 @@ from llama_stack.providers.datatypes import (
 )
 
 
-def available_providers() -> List[ProviderSpec]:
+def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
             api=Api.vector_io,
diff --git a/llama_stack/providers/remote/datasetio/huggingface/config.py b/llama_stack/providers/remote/datasetio/huggingface/config.py
index c06996b6f..38f933728 100644
--- a/llama_stack/providers/remote/datasetio/huggingface/config.py
+++ b/llama_stack/providers/remote/datasetio/huggingface/config.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -17,7 +17,7 @@ class HuggingfaceDatasetIOConfig(BaseModel):
     kvstore: KVStoreConfig
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "kvstore": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
index 7a17e5e42..fafd1d8ff 100644
--- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
+++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List, Optional
+from typing import Any
 from urllib.parse import parse_qs, urlparse
 
 import datasets as hf_datasets
@@ -12,8 +12,8 @@ from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
-from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
 
 from .config import HuggingfaceDatasetIOConfig
 
@@ -42,7 +42,7 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         # Load existing datasets from kvstore
         start_key = DATASETS_PREFIX
         end_key = f"{DATASETS_PREFIX}\xff"
-        stored_datasets = await self.kvstore.range(start_key, end_key)
+        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
 
         for dataset in stored_datasets:
             dataset = Dataset.model_validate_json(dataset)
@@ -70,8 +70,8 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
     async def iterrows(
         self,
         dataset_id: str,
-        start_index: Optional[int] = None,
-        limit: Optional[int] = None,
+        start_index: int | None = None,
+        limit: int | None = None,
     ) -> PaginatedResponse:
         dataset_def = self.dataset_infos[dataset_id]
         path, params = parse_hf_params(dataset_def)
@@ -80,7 +80,7 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         records = [loaded_dataset[i] for i in range(len(loaded_dataset))]
         return paginate_records(records, start_index, limit)
 
-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
         dataset_def = self.dataset_infos[dataset_id]
         path, params = parse_hf_params(dataset_def)
         loaded_dataset = hf_datasets.load_dataset(path, **params)
diff --git a/llama_stack/providers/remote/datasetio/nvidia/README.md b/llama_stack/providers/remote/datasetio/nvidia/README.md
new file mode 100644
index 000000000..1d3d15132
--- /dev/null
+++ b/llama_stack/providers/remote/datasetio/nvidia/README.md
@@ -0,0 +1,74 @@
+# NVIDIA DatasetIO Provider for LlamaStack
+
+This provider enables dataset management using NVIDIA's NeMo Customizer service.
+
+## Features
+
+- Register datasets for fine-tuning LLMs
+- Unregister datasets
+
+## Getting Started
+
+### Prerequisites
+
+- LlamaStack with NVIDIA configuration
+- Access to Hosted NVIDIA NeMo Microservice
+- API key for authentication with the NVIDIA service
+
+### Setup
+
+Build the NVIDIA environment:
+
+```bash
+llama stack build --template nvidia --image-type conda
+```
+
+### Basic Usage using the LlamaStack Python Client
+
+#### Initialize the client
+
+```python
+import os
+
+os.environ["NVIDIA_API_KEY"] = "your-api-key"
+os.environ["NVIDIA_DATASETS_URL"] = "http://nemo.test"
+os.environ["NVIDIA_USER_ID"] = "llama-stack-user"
+os.environ["NVIDIA_DATASET_NAMESPACE"] = "default"
+os.environ["NVIDIA_PROJECT_ID"] = "test-project"
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+client = LlamaStackAsLibraryClient("nvidia")
+client.initialize()
+```
+
+#### Register a dataset
+
+```python
+client.datasets.register(
+    purpose="post-training/messages",
+    dataset_id="my-training-dataset",
+    source={"type": "uri", "uri": "hf://datasets/default/sample-dataset"},
+    metadata={
+        "format": "json",
+        "description": "Dataset for LLM fine-tuning",
+        "provider": "nvidia",
+    },
+)
+```
+
+#### Get a list of all registered datasets
+
+```python
+datasets = client.datasets.list()
+for dataset in datasets:
+    print(f"Dataset ID: {dataset.identifier}")
+    print(f"Description: {dataset.metadata.get('description', '')}")
+    print(f"Source: {dataset.source.uri}")
+    print("---")
+```
+
+#### Unregister a dataset
+
+```python
+client.datasets.unregister(dataset_id="my-training-dataset")
+```
diff --git a/llama_stack/providers/remote/datasetio/nvidia/__init__.py b/llama_stack/providers/remote/datasetio/nvidia/__init__.py
new file mode 100644
index 000000000..418daec8d
--- /dev/null
+++ b/llama_stack/providers/remote/datasetio/nvidia/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import NvidiaDatasetIOConfig
+
+
+async def get_adapter_impl(config: NvidiaDatasetIOConfig, _deps):
+    """Build the NVIDIA DatasetIO adapter for ``config``; ``_deps`` is not used.
+
+    Raises RuntimeError when ``config`` is not a NvidiaDatasetIOConfig.
+    """
+    # The adapter module is imported here, at call time, before the type check runs.
+    from .datasetio import NvidiaDatasetIOAdapter
+
+    if isinstance(config, NvidiaDatasetIOConfig):
+        return NvidiaDatasetIOAdapter(config)
+    raise RuntimeError(f"Unexpected config type: {type(config)}")
+
+
+__all__ = ["get_adapter_impl", "NvidiaDatasetIOConfig"]  # only names bound at module level
diff --git a/llama_stack/providers/remote/datasetio/nvidia/config.py b/llama_stack/providers/remote/datasetio/nvidia/config.py
new file mode 100644
index 000000000..e616ce25c
--- /dev/null
+++ b/llama_stack/providers/remote/datasetio/nvidia/config.py
@@ -0,0 +1,61 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+import warnings
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class NvidiaDatasetIOConfig(BaseModel):
+    """Configuration for NVIDIA DatasetIO implementation (unset fields fall back to NVIDIA_* env vars)."""
+
+    api_key: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
+        description="The NVIDIA API key.",
+    )
+
+    dataset_namespace: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
+        description="The NVIDIA dataset namespace.",
+    )
+
+    project_id: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-project"),
+        description="The NVIDIA project ID.",
+    )
+
+    datasets_url: str = Field(
+        default_factory=lambda: os.getenv("NVIDIA_DATASETS_URL", "http://nemo.test"),
+        description="Base URL for the NeMo Dataset API",
+    )
+
+    def model_post_init(self, context: Any, /) -> None:
+        """Pydantic post-validation hook: warn about fields left on their built-in defaults."""
+        # BaseModel never calls the dataclass-style __post_init__, so the check lives in this hook.
+        env_by_field = {
+            "project_id": "NVIDIA_PROJECT_ID",
+            "dataset_namespace": "NVIDIA_DATASET_NAMESPACE",
+            "datasets_url": "NVIDIA_DATASETS_URL",
+        }
+        default_values = [
+            f"{name}={getattr(self, name)!r}"
+            for name, env_var in env_by_field.items()
+            if name not in self.model_fields_set and os.getenv(env_var) is None
+        ]
+        if default_values:
+            msg = f"Using default values: {', '.join(default_values)}. "
+            warnings.warn(msg + "Please set the environment variables to avoid this default behavior.", stacklevel=2)
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+        return {
+            "api_key": "${env.NVIDIA_API_KEY:}",
+            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}",
+            "project_id": "${env.NVIDIA_PROJECT_ID:test-project}",
+            "datasets_url": "${env.NVIDIA_DATASETS_URL:http://nemo.test}",
+        }
diff --git a/llama_stack/providers/remote/datasetio/nvidia/datasetio.py b/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
new file mode 100644
index 000000000..6a9e2bb58
--- /dev/null
+++ b/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+import aiohttp
+
+from llama_stack.apis.common.content_types import URL
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.common.type_system import ParamType
+from llama_stack.apis.datasets import Dataset
+
+from .config import NvidiaDatasetIOConfig
+
+
+class NvidiaDatasetIOAdapter:
+    """Nvidia NeMo DatasetIO API: registers and unregisters datasets; row access is not implemented."""
+
+    def __init__(self, config: NvidiaDatasetIOConfig):
+        self.config = config
+        # Base headers merged into every request; empty here (config.api_key is not attached).
+        self.headers = {}
+
+    async def _make_request(
+        self,
+        method: str,
+        path: str,
+        headers: dict[str, Any] | None = None,
+        params: dict[str, Any] | None = None,
+        json: dict[str, Any] | None = None,
+        **kwargs,
+    ) -> dict[str, Any]:
+        """Send one request to ``datasets_url + path`` and return the decoded JSON body.
+
+        Raises Exception when the status is not 200, quoting the error body (JSON, else raw text).
+        """
+        url = f"{self.config.datasets_url}{path}"
+        request_headers = {**self.headers, **(headers or {})}
+        async with aiohttp.ClientSession(headers=request_headers) as session:
+            async with session.request(method, url, params=params, json=json, **kwargs) as response:
+                if response.status == 200:
+                    return await response.json()
+                try:
+                    error_data = await response.json()
+                except (aiohttp.ContentTypeError, ValueError):
+                    # Error bodies are not guaranteed to be JSON (HTML or plain text is possible);
+                    # report the raw text instead of masking the failure with a decoding error.
+                    error_data = await response.text()
+                raise Exception(f"API request failed: {error_data}")
+
+    async def register_dataset(self, dataset_def: Dataset) -> Dataset:
+        """Register a new dataset by POSTing its definition to ``/v1/datasets``.
+
+        Args:
+            dataset_def [Dataset]: The dataset definition. Fields read here:
+                identifier [str]: Sent as the dataset name.
+                source.uri [str]: Sent as ``files_url``.
+                metadata [dict[str, Any]]: Optional; when non-empty, supplies
+                    format [str]: The format of the dataset.
+                    description [str]: The description of the dataset.
+        Returns:
+            The same ``dataset_def`` that was passed in.
+        """
+        ## add warnings for unsupported params
+        request_body = {
+            "name": dataset_def.identifier,
+            "namespace": self.config.dataset_namespace,
+            "files_url": dataset_def.source.uri,
+            "project": self.config.project_id,
+        }
+        if dataset_def.metadata:
+            # .get() yields None for a missing key, which is then sent as JSON null.
+            request_body["format"] = dataset_def.metadata.get("format")
+            request_body["description"] = dataset_def.metadata.get("description")
+        await self._make_request("POST", "/v1/datasets", json=request_body)
+        return dataset_def
+
+    async def update_dataset(
+        self,
+        dataset_id: str,
+        dataset_schema: dict[str, ParamType],
+        url: URL,
+        provider_dataset_id: str | None = None,
+        provider_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Not supported by this adapter; always raises NotImplementedError."""
+        raise NotImplementedError("Not implemented")
+
+    async def unregister_dataset(self, dataset_id: str) -> None:
+        """Delete ``dataset_id`` from the configured namespace via the datasets API."""
+        await self._make_request(
+            "DELETE",
+            f"/v1/datasets/{self.config.dataset_namespace}/{dataset_id}",
+            headers={"Accept": "application/json", "Content-Type": "application/json"},
+        )
+
+    async def iterrows(
+        self,
+        dataset_id: str,
+        start_index: int | None = None,
+        limit: int | None = None,
+    ) -> PaginatedResponse:
+        """Not supported by this adapter; always raises NotImplementedError."""
+        raise NotImplementedError("Not implemented")
+
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
+        """Not supported by this adapter; always raises NotImplementedError."""
+        raise NotImplementedError("Not implemented")
diff --git a/llama_stack/providers/remote/eval/nvidia/__init__.py b/llama_stack/providers/remote/eval/nvidia/__init__.py
index 8abbec9b2..55e3754f3 100644
--- a/llama_stack/providers/remote/eval/nvidia/__init__.py
+++ b/llama_stack/providers/remote/eval/nvidia/__init__.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any
 
 from llama_stack.distribution.datatypes import Api
 
@@ -12,7 +12,7 @@ from .config import NVIDIAEvalConfig
 
 async def get_adapter_impl(
     config: NVIDIAEvalConfig,
-    deps: Dict[Api, Any],
+    deps: dict[Api, Any],
 ):
     from .eval import NVIDIAEvalImpl
 
diff --git a/llama_stack/providers/remote/eval/nvidia/config.py b/llama_stack/providers/remote/eval/nvidia/config.py
index b660fcd68..5c8f9ff76 100644
--- a/llama_stack/providers/remote/eval/nvidia/config.py
+++ b/llama_stack/providers/remote/eval/nvidia/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import os
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -23,7 +23,7 @@ class NVIDIAEvalConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "evaluator_url": "${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}",
         }
diff --git a/llama_stack/providers/remote/eval/nvidia/eval.py b/llama_stack/providers/remote/eval/nvidia/eval.py
index e1a3b5355..3572de0ef 100644
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List
+from typing import Any
 
 import requests
 
@@ -101,8 +101,8 @@ class NVIDIAEvalImpl(
     async def evaluate_rows(
         self,
         benchmark_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
+        input_rows: list[dict[str, Any]],
+        scoring_functions: list[str],
         benchmark_config: BenchmarkConfig,
     ) -> EvaluateResponse:
         raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/anthropic/__init__.py b/llama_stack/providers/remote/inference/anthropic/__init__.py
index 3075f856e..8b420a5a0 100644
--- a/llama_stack/providers/remote/inference/anthropic/__init__.py
+++ b/llama_stack/providers/remote/inference/anthropic/__init__.py
@@ -4,15 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Optional
-
 from pydantic import BaseModel
 
 from .config import AnthropicConfig
 
 
 class AnthropicProviderDataValidator(BaseModel):
-    anthropic_api_key: Optional[str] = None
+    anthropic_api_key: str | None = None
 
 
 async def get_adapter_impl(config: AnthropicConfig, _deps):
diff --git a/llama_stack/providers/remote/inference/anthropic/config.py b/llama_stack/providers/remote/inference/anthropic/config.py
index 0e9469602..10da0025e 100644
--- a/llama_stack/providers/remote/inference/anthropic/config.py
+++ b/llama_stack/providers/remote/inference/anthropic/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class AnthropicProviderDataValidator(BaseModel):
-    anthropic_api_key: Optional[str] = Field(
+    anthropic_api_key: str | None = Field(
         default=None,
         description="API key for Anthropic models",
     )
@@ -20,13 +20,13 @@ class AnthropicProviderDataValidator(BaseModel):
 
 @json_schema_type
 class AnthropicConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="API key for Anthropic models",
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "api_key": api_key,
         }
diff --git a/llama_stack/providers/remote/inference/bedrock/__init__.py b/llama_stack/providers/remote/inference/bedrock/__init__.py
index e72c6ada9..4d98f4999 100644
--- a/llama_stack/providers/remote/inference/bedrock/__init__.py
+++ b/llama_stack/providers/remote/inference/bedrock/__init__.py
@@ -1,18 +1,18 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from .config import BedrockConfig
-
-
-async def get_adapter_impl(config: BedrockConfig, _deps):
-    from .bedrock import BedrockInferenceAdapter
-
-    assert isinstance(config, BedrockConfig), f"Unexpected config type: {type(config)}"
-
-    impl = BedrockInferenceAdapter(config)
-
-    await impl.initialize()
-
-    return impl
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from .config import BedrockConfig
+
+
+async def get_adapter_impl(config: BedrockConfig, _deps):
+    """Create and initialize the Bedrock inference adapter; ``_deps`` is not used."""
+    from .bedrock import BedrockInferenceAdapter
+
+    assert isinstance(config, BedrockConfig), f"Unexpected config type: {type(config)}"
+
+    adapter = BedrockInferenceAdapter(config)
+    await adapter.initialize()
+
+    return adapter
diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py
index f8dbcf31a..952d86f1a 100644
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import json
-from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
 
 from botocore.client import BaseClient
 
@@ -22,6 +22,7 @@ from llama_stack.apis.inference import (
     Inference,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -79,26 +80,26 @@ class BedrockInferenceAdapter(
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         raise NotImplementedError()
 
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
+    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self.model_store.get_model(model_id)
@@ -151,7 +152,7 @@ class BedrockInferenceAdapter(
         async for chunk in process_chat_completion_stream_response(stream, request):
             yield chunk
 
-    async def _get_params_for_chat_completion(self, request: ChatCompletionRequest) -> Dict:
+    async def _get_params_for_chat_completion(self, request: ChatCompletionRequest) -> dict:
         bedrock_model = request.model
 
         sampling_params = request.sampling_params
@@ -176,10 +177,10 @@ class BedrockInferenceAdapter(
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         model = await self.model_store.get_model(model_id)
         embeddings = []
@@ -197,3 +198,13 @@ class BedrockInferenceAdapter(
             response_body = json.loads(response.get("body").read())
             embeddings.append(response_body.get("embedding"))
         return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/bedrock/config.py b/llama_stack/providers/remote/inference/bedrock/config.py
index f2e8930be..5961a2f15 100644
--- a/llama_stack/providers/remote/inference/bedrock/config.py
+++ b/llama_stack/providers/remote/inference/bedrock/config.py
@@ -1,11 +1,11 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig
-
-
-class BedrockConfig(BedrockBaseConfig):
-    pass
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig
+
+
+class BedrockConfig(BedrockBaseConfig):
+    pass
diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py
index 3156601be..952118e24 100644
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import AsyncGenerator, List, Optional, Union
+from collections.abc import AsyncGenerator
 
 from cerebras.cloud.sdk import AsyncCerebras
 
@@ -21,6 +21,7 @@ from llama_stack.apis.inference import (
     Inference,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -79,10 +80,10 @@ class CerebrasInferenceAdapter(
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -120,15 +121,15 @@ class CerebrasInferenceAdapter(
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -166,7 +167,7 @@ class CerebrasInferenceAdapter(
         async for chunk in process_chat_completion_stream_response(stream, request):
             yield chunk
 
-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
         if request.sampling_params and isinstance(request.sampling_params.strategy, TopKSamplingStrategy):
             raise ValueError("`top_k` not supported by Cerebras")
 
@@ -188,9 +189,19 @@ class CerebrasInferenceAdapter(
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         raise NotImplementedError()
+
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/cerebras/config.py b/llama_stack/providers/remote/inference/cerebras/config.py
index 81682c980..81312ec76 100644
--- a/llama_stack/providers/remote/inference/cerebras/config.py
+++ b/llama_stack/providers/remote/inference/cerebras/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import os
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -20,13 +20,13 @@ class CerebrasImplConfig(BaseModel):
         default=os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL),
         description="Base URL for the Cerebras API",
     )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
         default=os.environ.get("CEREBRAS_API_KEY"),
         description="Cerebras API Key",
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "base_url": DEFAULT_BASE_URL,
             "api_key": "${env.CEREBRAS_API_KEY}",
diff --git a/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py b/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py
index a5f07edd2..523a8dfe7 100644
--- a/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import CerebrasCompatConfig
 
 
-async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .cerebras import CerebrasCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py b/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py
index 149c0a202..cb8daff6a 100644
--- a/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class CerebrasProviderDataValidator(BaseModel):
-    cerebras_api_key: Optional[str] = Field(
+    cerebras_api_key: str | None = Field(
         default=None,
         description="API key for Cerebras models",
     )
@@ -20,7 +20,7 @@ class CerebrasProviderDataValidator(BaseModel):
 
 @json_schema_type
 class CerebrasCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="The Cerebras API key",
     )
@@ -31,7 +31,7 @@ class CerebrasCompatConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "openai_compat_api_base": "https://api.cerebras.ai/v1",
             "api_key": api_key,
diff --git a/llama_stack/providers/remote/inference/databricks/config.py b/llama_stack/providers/remote/inference/databricks/config.py
index 1d51125cb..5710dcef3 100644
--- a/llama_stack/providers/remote/inference/databricks/config.py
+++ b/llama_stack/providers/remote/inference/databricks/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -28,7 +28,7 @@ class DatabricksImplConfig(BaseModel):
         url: str = "${env.DATABRICKS_URL}",
         api_token: str = "${env.DATABRICKS_API_TOKEN}",
         **kwargs: Any,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         return {
             "url": url,
             "api_token": api_token,
diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py
index 27d96eb7d..1dc18b97f 100644
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import AsyncGenerator, List, Optional
+from collections.abc import AsyncGenerator
 
 from openai import OpenAI
 
@@ -20,6 +20,7 @@ from llama_stack.apis.inference import (
     Inference,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -78,25 +79,25 @@ class DatabricksInferenceAdapter(
         self,
         model: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         raise NotImplementedError()
 
     async def chat_completion(
         self,
         model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -146,9 +147,19 @@ class DatabricksInferenceAdapter(
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         raise NotImplementedError()
+
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
diff --git a/llama_stack/providers/remote/inference/fireworks/config.py b/llama_stack/providers/remote/inference/fireworks/config.py
index c21ce4a40..072d558f4 100644
--- a/llama_stack/providers/remote/inference/fireworks/config.py
+++ b/llama_stack/providers/remote/inference/fireworks/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -17,13 +17,13 @@ class FireworksImplConfig(BaseModel):
         default="https://api.fireworks.ai/inference/v1",
         description="The URL for the Fireworks server",
     )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
         default=None,
         description="The Fireworks.ai API Key",
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.fireworks.ai/inference/v1",
             "api_key": api_key,
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 58678a9cc..fe21685dd 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any
 
 from fireworks.client import Fireworks
 from openai import AsyncOpenAI
@@ -36,6 +37,7 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
     OpenAICompletion,
+    OpenAIEmbeddingsResponse,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
 )
@@ -105,10 +107,10 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -146,9 +148,9 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
 
     def _build_options(
         self,
-        sampling_params: Optional[SamplingParams],
+        sampling_params: SamplingParams | None,
         fmt: ResponseFormat,
-        logprobs: Optional[LogProbConfig],
+        logprobs: LogProbConfig | None,
     ) -> dict:
         options = get_sampling_options(sampling_params)
         options.setdefault("max_tokens", 512)
@@ -177,15 +179,15 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -229,7 +231,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
         async for chunk in process_chat_completion_stream_response(stream, request):
             yield chunk
 
-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
         input_dict = {}
         media_present = request_has_media(request)
 
@@ -263,10 +265,10 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         model = await self.model_store.get_model(model_id)
 
@@ -285,27 +287,37 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
         embeddings = [data.embedding for data in response.data]
         return EmbeddingsResponse(embeddings=embeddings)
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
+
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         model_obj = await self.model_store.get_model(model)
 
@@ -338,29 +350,29 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         model_obj = await self.model_store.get_model(model)
 
         # Divert Llama Models through Llama Stack inference APIs because
diff --git a/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py b/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py
index f78f218b5..15a666cb6 100644
--- a/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import FireworksCompatConfig
 
 
-async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .fireworks import FireworksCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py b/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py
index 0263d348a..bf38cdd2b 100644
--- a/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class FireworksProviderDataValidator(BaseModel):
-    fireworks_api_key: Optional[str] = Field(
+    fireworks_api_key: str | None = Field(
         default=None,
         description="API key for Fireworks models",
     )
@@ -20,7 +20,7 @@ class FireworksProviderDataValidator(BaseModel):
 
 @json_schema_type
 class FireworksCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="The Fireworks API key",
     )
@@ -31,7 +31,7 @@ class FireworksCompatConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "openai_compat_api_base": "https://api.fireworks.ai/inference/v1",
             "api_key": api_key,
diff --git a/llama_stack/providers/remote/inference/gemini/__init__.py b/llama_stack/providers/remote/inference/gemini/__init__.py
index dd972f21c..9d35da893 100644
--- a/llama_stack/providers/remote/inference/gemini/__init__.py
+++ b/llama_stack/providers/remote/inference/gemini/__init__.py
@@ -4,15 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Optional
-
 from pydantic import BaseModel
 
 from .config import GeminiConfig
 
 
 class GeminiProviderDataValidator(BaseModel):
-    gemini_api_key: Optional[str] = None
+    gemini_api_key: str | None = None
 
 
 async def get_adapter_impl(config: GeminiConfig, _deps):
diff --git a/llama_stack/providers/remote/inference/gemini/config.py b/llama_stack/providers/remote/inference/gemini/config.py
index 30c8d9913..63ef4de01 100644
--- a/llama_stack/providers/remote/inference/gemini/config.py
+++ b/llama_stack/providers/remote/inference/gemini/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class GeminiProviderDataValidator(BaseModel):
-    gemini_api_key: Optional[str] = Field(
+    gemini_api_key: str | None = Field(
         default=None,
         description="API key for Gemini models",
     )
@@ -20,13 +20,13 @@ class GeminiProviderDataValidator(BaseModel):
 
 @json_schema_type
 class GeminiConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="API key for Gemini models",
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "api_key": api_key,
         }
diff --git a/llama_stack/providers/remote/inference/groq/config.py b/llama_stack/providers/remote/inference/groq/config.py
index 8a1204b0b..fe060507a 100644
--- a/llama_stack/providers/remote/inference/groq/config.py
+++ b/llama_stack/providers/remote/inference/groq/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class GroqProviderDataValidator(BaseModel):
-    groq_api_key: Optional[str] = Field(
+    groq_api_key: str | None = Field(
         default=None,
         description="API key for Groq models",
     )
@@ -20,7 +20,7 @@ class GroqProviderDataValidator(BaseModel):
 
 @json_schema_type
 class GroqConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         # The Groq client library loads the GROQ_API_KEY environment variable by default
         default=None,
         description="The Groq API key",
@@ -32,7 +32,7 @@ class GroqConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.groq.com",
             "api_key": api_key,
diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py
index f3f14e9af..27d7d7961 100644
--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncIterator
+from typing import Any
 
 from openai import AsyncOpenAI
 
@@ -59,29 +60,29 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         model_obj = await self.model_store.get_model(model)
 
         # Groq does not support json_schema response format, so we need to convert it to json_object
diff --git a/llama_stack/providers/remote/inference/groq_openai_compat/__init__.py b/llama_stack/providers/remote/inference/groq_openai_compat/__init__.py
index 8161df20d..794cdebd7 100644
--- a/llama_stack/providers/remote/inference/groq_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/groq_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import GroqCompatConfig
 
 
-async def get_adapter_impl(config: GroqCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: GroqCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .groq import GroqCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/groq_openai_compat/config.py b/llama_stack/providers/remote/inference/groq_openai_compat/config.py
index 4b90b4576..481f740f9 100644
--- a/llama_stack/providers/remote/inference/groq_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/groq_openai_compat/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class GroqProviderDataValidator(BaseModel):
-    groq_api_key: Optional[str] = Field(
+    groq_api_key: str | None = Field(
         default=None,
         description="API key for Groq models",
     )
@@ -20,7 +20,7 @@ class GroqProviderDataValidator(BaseModel):
 
 @json_schema_type
 class GroqCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="The Groq API key",
     )
@@ -31,7 +31,7 @@ class GroqCompatConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "openai_compat_api_base": "https://api.groq.com/openai/v1",
             "api_key": api_key,
diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/__init__.py b/llama_stack/providers/remote/inference/llama_openai_compat/__init__.py
new file mode 100644
index 000000000..be48d1067
--- /dev/null
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import InferenceProvider
+
+from .config import LlamaCompatConfig
+
+
+async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> InferenceProvider:
+    # import dynamically so the import is used only when it is needed
+    from .llama import LlamaCompatInferenceAdapter
+
+    adapter = LlamaCompatInferenceAdapter(config)
+    return adapter
diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/config.py b/llama_stack/providers/remote/inference/llama_openai_compat/config.py
new file mode 100644
index 000000000..57bc7240d
--- /dev/null
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/config.py
@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class LlamaProviderDataValidator(BaseModel):
+    llama_api_key: str | None = Field(
+        default=None,
+        description="API key for api.llama models",
+    )
+
+
+@json_schema_type
+class LlamaCompatConfig(BaseModel):
+    api_key: str | None = Field(
+        default=None,
+        description="The Llama API key",
+    )
+
+    openai_compat_api_base: str = Field(
+        default="https://api.llama.com/compat/v1/",
+        description="The URL for the Llama API server",
+    )
+
+    @classmethod
+    def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]:
+        return {
+            "openai_compat_api_base": "https://api.llama.com/compat/v1/",
+            "api_key": api_key,
+        }
diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
new file mode 100644
index 000000000..29b5e889a
--- /dev/null
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@@ -0,0 +1,34 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.remote.inference.llama_openai_compat.config import (
+    LlamaCompatConfig,
+)
+from llama_stack.providers.utils.inference.litellm_openai_mixin import (
+    LiteLLMOpenAIMixin,
+)
+
+from .models import MODEL_ENTRIES
+
+
+class LlamaCompatInferenceAdapter(LiteLLMOpenAIMixin):
+    _config: LlamaCompatConfig
+
+    def __init__(self, config: LlamaCompatConfig):
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=MODEL_ENTRIES,
+            api_key_from_config=config.api_key,
+            provider_data_api_key_field="llama_api_key",
+            openai_compat_api_base=config.openai_compat_api_base,
+        )
+        self.config = config
+
+    async def initialize(self):
+        await super().initialize()
+
+    async def shutdown(self):
+        await super().shutdown()
diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/models.py b/llama_stack/providers/remote/inference/llama_openai_compat/models.py
new file mode 100644
index 000000000..6285e98e1
--- /dev/null
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/models.py
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.models.llama.sku_types import CoreModelId
+from llama_stack.providers.utils.inference.model_registry import (
+    build_hf_repo_model_entry,
+)
+
+MODEL_ENTRIES = [
+    build_hf_repo_model_entry(
+        "Llama-3.3-70B-Instruct",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "Llama-4-Scout-17B-16E-Instruct-FP8",
+        CoreModelId.llama4_scout_17b_16e_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "Llama-4-Maverick-17B-128E-Instruct-FP8",
+        CoreModelId.llama4_maverick_17b_128e_instruct.value,
+    ),
+]
diff --git a/llama_stack/providers/remote/inference/nvidia/config.py b/llama_stack/providers/remote/inference/nvidia/config.py
index 8f80408d4..4c449edc2 100644
--- a/llama_stack/providers/remote/inference/nvidia/config.py
+++ b/llama_stack/providers/remote/inference/nvidia/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import os
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -39,7 +39,7 @@ class NVIDIAConfig(BaseModel):
         default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com"),
         description="A base url for accessing the NVIDIA NIM",
     )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
         default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
         description="The NVIDIA API key, only needed of using the hosted service",
     )
@@ -53,7 +53,7 @@ class NVIDIAConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "url": "${env.NVIDIA_BASE_URL:https://integrate.api.nvidia.com}",
             "api_key": "${env.NVIDIA_API_KEY:}",
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 4a62ad6cb..4c68322e0 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -6,8 +6,9 @@
 
 import logging
 import warnings
+from collections.abc import AsyncIterator
 from functools import lru_cache
-from typing import Any, AsyncIterator, Dict, List, Optional, Union
+from typing import Any
 
 from openai import APIConnectionError, AsyncOpenAI, BadRequestError
 
@@ -28,6 +29,7 @@ from llama_stack.apis.inference import (
     Inference,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -141,11 +143,11 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+    ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
         if sampling_params is None:
             sampling_params = SamplingParams()
         if content_has_media(content):
@@ -182,20 +184,20 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         if any(content_has_media(content) for content in contents):
             raise NotImplementedError("Media is not supported")
 
         #
-        # Llama Stack: contents = List[str] | List[InterleavedContentItem]
+        # Llama Stack: contents = list[str] | list[InterleavedContentItem]
         #  ->
-        # OpenAI: input = str | List[str]
+        # OpenAI: input = str | list[str]
         #
-        # we can ignore str and always pass List[str] to OpenAI
+        # we can ignore str and always pass list[str] to OpenAI
         #
         flat_contents = [content.text if isinstance(content, TextContentItem) else content for content in contents]
         input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents]
@@ -231,25 +233,35 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
             raise ValueError(f"Failed to get embeddings: {e}") from e
 
         #
-        # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=List[float], ...)], ...)
+        # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...)
         #  ->
-        # Llama Stack: EmbeddingsResponse(embeddings=List[List[float]])
+        # Llama Stack: EmbeddingsResponse(embeddings=list[list[float]])
         #
         return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
+
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
+    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
         if sampling_params is None:
             sampling_params = SamplingParams()
         if tool_prompt_format:
@@ -286,24 +298,24 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         provider_model_id = await self._get_provider_model_id(model)
 
@@ -335,29 +347,29 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         provider_model_id = await self._get_provider_model_id(model)
 
         params = await prepare_openai_completion_params(
diff --git a/llama_stack/providers/remote/inference/nvidia/openai_utils.py b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
index 3f2769b26..0b0d7fcf3 100644
--- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
@@ -5,7 +5,8 @@
 # the root directory of this source tree.
 
 import warnings
-from typing import Any, AsyncGenerator, Dict, List, Optional
+from collections.abc import AsyncGenerator
+from typing import Any
 
 from openai import AsyncStream
 from openai.types.chat.chat_completion import (
@@ -64,7 +65,7 @@ async def convert_chat_completion_request(
         )
 
     nvext = {}
-    payload: Dict[str, Any] = dict(
+    payload: dict[str, Any] = dict(
         model=request.model,
         messages=[await convert_message_to_openai_dict_new(message) for message in request.messages],
         stream=request.stream,
@@ -137,7 +138,7 @@ def convert_completion_request(
     # logprobs.top_k -> logprobs
 
     nvext = {}
-    payload: Dict[str, Any] = dict(
+    payload: dict[str, Any] = dict(
         model=request.model,
         prompt=request.content,
         stream=request.stream,
@@ -176,8 +177,8 @@ def convert_completion_request(
 
 
 def _convert_openai_completion_logprobs(
-    logprobs: Optional[OpenAICompletionLogprobs],
-) -> Optional[List[TokenLogProbs]]:
+    logprobs: OpenAICompletionLogprobs | None,
+) -> list[TokenLogProbs] | None:
     """
     Convert an OpenAI CompletionLogprobs into a list of TokenLogProbs.
     """
diff --git a/llama_stack/providers/remote/inference/nvidia/utils.py b/llama_stack/providers/remote/inference/nvidia/utils.py
index 7d3f3f27e..74019999e 100644
--- a/llama_stack/providers/remote/inference/nvidia/utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/utils.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 
 import logging
-from typing import Tuple
 
 import httpx
 
@@ -18,7 +17,7 @@ def _is_nvidia_hosted(config: NVIDIAConfig) -> bool:
     return "integrate.api.nvidia.com" in config.url
 
 
-async def _get_health(url: str) -> Tuple[bool, bool]:
+async def _get_health(url: str) -> tuple[bool, bool]:
     """
     Query {url}/v1/health/{live,ready} to check if the server is running and ready
 
diff --git a/llama_stack/providers/remote/inference/ollama/config.py b/llama_stack/providers/remote/inference/ollama/config.py
index a5a4d48ab..0e4aef0e1 100644
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -15,5 +15,5 @@ class OllamaImplConfig(BaseModel):
     url: str = DEFAULT_OLLAMA_URL
 
     @classmethod
-    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> dict[str, Any]:
         return {"url": url}
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index cdfe7b568..8863e0edc 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -5,10 +5,11 @@
 # the root directory of this source tree.
 
 
-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any
 
 import httpx
-from ollama import AsyncClient
+from ollama import AsyncClient  # type: ignore[attr-defined]
 from openai import AsyncOpenAI
 
 from llama_stack.apis.common.content_types import (
@@ -27,10 +28,11 @@ from llama_stack.apis.inference import (
     EmbeddingsResponse,
     EmbeddingTaskType,
     GrammarResponseFormat,
-    Inference,
+    InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -60,6 +62,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
     get_sampling_options,
+    prepare_openai_completion_params,
     process_chat_completion_response,
     process_chat_completion_stream_response,
     process_completion_response,
@@ -80,7 +83,7 @@ logger = get_logger(name=__name__, category="inference")
 
 
 class OllamaInferenceAdapter(
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, url: str) -> None:
@@ -130,14 +133,16 @@ class OllamaInferenceAdapter(
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self._get_model(model_id)
+        if model.provider_resource_id is None:
+            raise ValueError(f"Model {model_id} has no provider_resource_id set")
         request = CompletionRequest(
             model=model.provider_resource_id,
             content=content,
@@ -188,19 +193,21 @@ class OllamaInferenceAdapter(
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self._get_model(model_id)
+        if model.provider_resource_id is None:
+            raise ValueError(f"Model {model_id} has no provider_resource_id set")
         request = ChatCompletionRequest(
             model=model.provider_resource_id,
             messages=messages,
@@ -216,7 +223,7 @@ class OllamaInferenceAdapter(
         else:
             return await self._nonstream_chat_completion(request)
 
-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
         sampling_options = get_sampling_options(request.sampling_params)
         # This is needed since the Ollama API expects num_predict to be set
         # for early truncation instead of max_tokens.
@@ -314,10 +321,10 @@ class OllamaInferenceAdapter(
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         model = await self._get_model(model_id)
 
@@ -333,7 +340,10 @@ class OllamaInferenceAdapter(
         return EmbeddingsResponse(embeddings=embeddings)
 
     async def register_model(self, model: Model) -> Model:
-        model = await self.register_helper.register_model(model)
+        try:
+            model = await self.register_helper.register_model(model)
+        except ValueError:
+            pass  # Ignore statically unknown model, will check live listing
         if model.model_type == ModelType.embedding:
             logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...")
             await self.client.pull(model.provider_resource_id)
@@ -342,9 +352,14 @@ class OllamaInferenceAdapter(
         #  - models not currently running are run by the ollama server as needed
         response = await self.client.list()
         available_models = [m["model"] for m in response["models"]]
-        if model.provider_resource_id not in available_models:
+        if model.provider_resource_id is None:
+            raise ValueError("Model provider_resource_id cannot be None")
+        provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
+        if provider_resource_id is None:
+            provider_resource_id = model.provider_resource_id
+        if provider_resource_id not in available_models:
             available_models_latest = [m["model"].split(":latest")[0] for m in response["models"]]
-            if model.provider_resource_id in available_models_latest:
+            if provider_resource_id in available_models_latest:
                 logger.warning(
                     f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"
                 )
@@ -352,142 +367,145 @@ class OllamaInferenceAdapter(
             raise ValueError(
                 f"Model '{model.provider_resource_id}' is not available in Ollama. Available models: {', '.join(available_models)}"
             )
+        model.provider_resource_id = provider_resource_id
 
         return model
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
+
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         if not isinstance(prompt, str):
             raise ValueError("Ollama does not support non-string prompts for completion")
 
         model_obj = await self._get_model(model)
-        params = {
-            k: v
-            for k, v in {
-                "model": model_obj.provider_resource_id,
-                "prompt": prompt,
-                "best_of": best_of,
-                "echo": echo,
-                "frequency_penalty": frequency_penalty,
-                "logit_bias": logit_bias,
-                "logprobs": logprobs,
-                "max_tokens": max_tokens,
-                "n": n,
-                "presence_penalty": presence_penalty,
-                "seed": seed,
-                "stop": stop,
-                "stream": stream,
-                "stream_options": stream_options,
-                "temperature": temperature,
-                "top_p": top_p,
-                "user": user,
-            }.items()
-            if v is not None
-        }
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
         return await self.openai_client.completions.create(**params)  # type: ignore
 
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         model_obj = await self._get_model(model)
-        params = {
-            k: v
-            for k, v in {
-                "model": model_obj.provider_resource_id,
-                "messages": messages,
-                "frequency_penalty": frequency_penalty,
-                "function_call": function_call,
-                "functions": functions,
-                "logit_bias": logit_bias,
-                "logprobs": logprobs,
-                "max_completion_tokens": max_completion_tokens,
-                "max_tokens": max_tokens,
-                "n": n,
-                "parallel_tool_calls": parallel_tool_calls,
-                "presence_penalty": presence_penalty,
-                "response_format": response_format,
-                "seed": seed,
-                "stop": stop,
-                "stream": stream,
-                "stream_options": stream_options,
-                "temperature": temperature,
-                "tool_choice": tool_choice,
-                "tools": tools,
-                "top_logprobs": top_logprobs,
-                "top_p": top_p,
-                "user": user,
-            }.items()
-            if v is not None
-        }
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
         return await self.openai_client.chat.completions.create(**params)  # type: ignore
 
     async def batch_completion(
         self,
         model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch completion is not supported for Ollama")
 
     async def batch_chat_completion(
         self,
         model_id: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_config: Optional[ToolConfig] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_config: ToolConfig | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch chat completion is not supported for Ollama")
 
 
-async def convert_message_to_openai_dict_for_ollama(message: Message) -> List[dict]:
+async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
     async def _convert_content(content) -> dict:
         if isinstance(content, ImageContentItem):
             return {
diff --git a/llama_stack/providers/remote/inference/openai/__init__.py b/llama_stack/providers/remote/inference/openai/__init__.py
index 000a03d33..c245dbe10 100644
--- a/llama_stack/providers/remote/inference/openai/__init__.py
+++ b/llama_stack/providers/remote/inference/openai/__init__.py
@@ -4,15 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Optional
-
 from pydantic import BaseModel
 
 from .config import OpenAIConfig
 
 
 class OpenAIProviderDataValidator(BaseModel):
-    openai_api_key: Optional[str] = None
+    openai_api_key: str | None = None
 
 
 async def get_adapter_impl(config: OpenAIConfig, _deps):
diff --git a/llama_stack/providers/remote/inference/openai/config.py b/llama_stack/providers/remote/inference/openai/config.py
index 2b0cc2c10..17fb98831 100644
--- a/llama_stack/providers/remote/inference/openai/config.py
+++ b/llama_stack/providers/remote/inference/openai/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class OpenAIProviderDataValidator(BaseModel):
-    openai_api_key: Optional[str] = Field(
+    openai_api_key: str | None = Field(
         default=None,
         description="API key for OpenAI models",
     )
@@ -20,13 +20,13 @@ class OpenAIProviderDataValidator(BaseModel):
 
 @json_schema_type
 class OpenAIConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="API key for OpenAI models",
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "api_key": api_key,
         }
diff --git a/llama_stack/providers/remote/inference/openai/models.py b/llama_stack/providers/remote/inference/openai/models.py
index 1737043a4..e029c456c 100644
--- a/llama_stack/providers/remote/inference/openai/models.py
+++ b/llama_stack/providers/remote/inference/openai/models.py
@@ -4,27 +4,60 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from dataclasses import dataclass
+
 from llama_stack.apis.models.models import ModelType
 from llama_stack.providers.utils.inference.model_registry import (
     ProviderModelEntry,
 )
 
 LLM_MODEL_IDS = [
+    # the models w/ "openai/" prefix are the litellm specific model names.
+    # they should be deprecated in favor of the canonical openai model names.
     "openai/gpt-4o",
     "openai/gpt-4o-mini",
     "openai/chatgpt-4o-latest",
+    "gpt-3.5-turbo-0125",
+    "gpt-3.5-turbo",
+    "gpt-3.5-turbo-instruct",
+    "gpt-4",
+    "gpt-4-turbo",
+    "gpt-4o",
+    "gpt-4o-2024-08-06",
+    "gpt-4o-mini",
+    "gpt-4o-audio-preview",
+    "chatgpt-4o-latest",
+    "o1",
+    "o1-mini",
+    "o3-mini",
+    "o4-mini",
 ]
 
 
+@dataclass
+class EmbeddingModelInfo:
+    """Structured representation of embedding model information."""
+
+    embedding_dimension: int
+    context_length: int
+
+
+EMBEDDING_MODEL_IDS: dict[str, EmbeddingModelInfo] = {
+    "openai/text-embedding-3-small": EmbeddingModelInfo(1536, 8192),
+    "openai/text-embedding-3-large": EmbeddingModelInfo(3072, 8192),
+    "text-embedding-3-small": EmbeddingModelInfo(1536, 8192),
+    "text-embedding-3-large": EmbeddingModelInfo(3072, 8192),
+}
+
+
 MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + [
     ProviderModelEntry(
-        provider_model_id="openai/text-embedding-3-small",
+        provider_model_id=model_id,
         model_type=ModelType.embedding,
-        metadata={"embedding_dimension": 1536, "context_length": 8192},
-    ),
-    ProviderModelEntry(
-        provider_model_id="openai/text-embedding-3-large",
-        model_type=ModelType.embedding,
-        metadata={"embedding_dimension": 3072, "context_length": 8192},
-    ),
+        metadata={
+            "embedding_dimension": model_info.embedding_dimension,
+            "context_length": model_info.context_length,
+        },
+    )
+    for model_id, model_info in EMBEDDING_MODEL_IDS.items()
 ]
diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py
index 6b9c02e6c..6f3a686a8 100644
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@@ -4,12 +4,45 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import logging
+from collections.abc import AsyncIterator
+from typing import Any
+
+from openai import AsyncOpenAI
+
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIEmbeddingData,
+    OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
 
 from .config import OpenAIConfig
 from .models import MODEL_ENTRIES
 
+logger = logging.getLogger(__name__)
 
+
+#
+# This OpenAI adapter implements Inference methods using two clients -
+#
+# | Inference Method           | Implementation Source    |
+# |----------------------------|--------------------------|
+# | completion                 | LiteLLMOpenAIMixin       |
+# | chat_completion            | LiteLLMOpenAIMixin       |
+# | embeddings                 | LiteLLMOpenAIMixin       |
+# | batch_completion           | LiteLLMOpenAIMixin       |
+# | batch_chat_completion      | LiteLLMOpenAIMixin       |
+# | openai_completion          | AsyncOpenAI              |
+# | openai_chat_completion     | AsyncOpenAI              |
+# | openai_embeddings          | AsyncOpenAI              |
+#
 class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
     def __init__(self, config: OpenAIConfig) -> None:
         LiteLLMOpenAIMixin.__init__(
@@ -19,9 +52,174 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
             provider_data_api_key_field="openai_api_key",
         )
         self.config = config
+        # we set is_openai_compat so users can use the canonical
+        # openai model names like "gpt-4" or "gpt-3.5-turbo"
+        # and the model name will be translated to litellm's
+        # "openai/gpt-4" or "openai/gpt-3.5-turbo" transparently.
+        # if we do not set this, users will be exposed to the
+        # litellm specific model names, an abstraction leak.
+        self.is_openai_compat = True
+        self._openai_client = AsyncOpenAI(
+            api_key=self.config.api_key,
+        )
 
     async def initialize(self) -> None:
         await super().initialize()
 
     async def shutdown(self) -> None:
         await super().shutdown()
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
+    ) -> OpenAICompletion:
+        if guided_choice is not None:
+            logger.warning("guided_choice is not supported by the OpenAI API. Ignoring.")
+        if prompt_logprobs is not None:
+            logger.warning("prompt_logprobs is not supported by the OpenAI API. Ignoring.")
+
+        model_id = (await self.model_store.get_model(model)).provider_resource_id
+        if model_id.startswith("openai/"):  # drop the litellm-style prefix before calling OpenAI
+            model_id = model_id[len("openai/") :]
+        params = await prepare_openai_completion_params(
+            model=model_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._openai_client.completions.create(**params)
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        model_id = (await self.model_store.get_model(model)).provider_resource_id
+        if model_id.startswith("openai/"):
+            model_id = model_id[len("openai/") :]
+        params = await prepare_openai_completion_params(
+            model=model_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._openai_client.chat.completions.create(**params)
+
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        model_id = (await self.model_store.get_model(model)).provider_resource_id
+        if model_id.startswith("openai/"):
+            model_id = model_id[len("openai/") :]
+
+        # Prepare parameters for OpenAI embeddings API
+        params: dict[str, Any] = {
+            "model": model_id,
+            "input": input,
+        }
+
+        if encoding_format is not None:
+            params["encoding_format"] = encoding_format
+        if dimensions is not None:
+            params["dimensions"] = dimensions
+        if user is not None:
+            params["user"] = user
+
+        # Call OpenAI embeddings API
+        response = await self._openai_client.embeddings.create(**params)
+
+        data = []
+        for i, embedding_data in enumerate(response.data):
+            data.append(
+                OpenAIEmbeddingData(
+                    embedding=embedding_data.embedding,
+                    index=i,
+                )
+            )
+
+        usage = OpenAIEmbeddingUsage(
+            prompt_tokens=response.usage.prompt_tokens,
+            total_tokens=response.usage.total_tokens,
+        )
+
+        return OpenAIEmbeddingsResponse(
+            data=data,
+            model=response.model,
+            usage=usage,
+        )
diff --git a/llama_stack/providers/remote/inference/passthrough/config.py b/llama_stack/providers/remote/inference/passthrough/config.py
index 46325e428..ce41495ce 100644
--- a/llama_stack/providers/remote/inference/passthrough/config.py
+++ b/llama_stack/providers/remote/inference/passthrough/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -18,13 +18,13 @@ class PassthroughImplConfig(BaseModel):
         description="The URL for the passthrough endpoint",
     )
 
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
         default=None,
         description="API Key for the passthrouth endpoint",
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "url": "${env.PASSTHROUGH_URL}",
             "api_key": "${env.PASSTHROUGH_API_KEY}",
diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index af05320b0..6cf4680e2 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any
 
 from llama_stack_client import AsyncLlamaStackClient
 
@@ -18,6 +19,7 @@ from llama_stack.apis.inference import (
     Inference,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -93,10 +95,10 @@ class PassthroughInferenceAdapter(Inference):
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -123,15 +125,15 @@ class PassthroughInferenceAdapter(Inference):
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -165,7 +167,7 @@ class PassthroughInferenceAdapter(Inference):
         else:
             return await self._nonstream_chat_completion(json_params)
 
-    async def _nonstream_chat_completion(self, json_params: Dict[str, Any]) -> ChatCompletionResponse:
+    async def _nonstream_chat_completion(self, json_params: dict[str, Any]) -> ChatCompletionResponse:
         client = self._get_client()
         response = await client.inference.chat_completion(**json_params)
 
@@ -178,7 +180,7 @@ class PassthroughInferenceAdapter(Inference):
             logprobs=response.logprobs,
         )
 
-    async def _stream_chat_completion(self, json_params: Dict[str, Any]) -> AsyncGenerator:
+    async def _stream_chat_completion(self, json_params: dict[str, Any]) -> AsyncGenerator:
         client = self._get_client()
         stream_response = await client.inference.chat_completion(**json_params)
 
@@ -193,10 +195,10 @@ class PassthroughInferenceAdapter(Inference):
     async def embeddings(
         self,
         model_id: str,
-        contents: List[InterleavedContent],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[InterleavedContent],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         client = self._get_client()
         model = await self.model_store.get_model(model_id)
@@ -209,27 +211,37 @@ class PassthroughInferenceAdapter(Inference):
             task_type=task_type,
         )
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
+
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         client = self._get_client()
         model_obj = await self.model_store.get_model(model)
@@ -261,29 +273,29 @@ class PassthroughInferenceAdapter(Inference):
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         client = self._get_client()
         model_obj = await self.model_store.get_model(model)
 
@@ -315,7 +327,7 @@ class PassthroughInferenceAdapter(Inference):
 
         return await client.inference.openai_chat_completion(**params)
 
-    def cast_value_to_json_dict(self, request_params: Dict[str, Any]) -> Dict[str, Any]:
+    def cast_value_to_json_dict(self, request_params: dict[str, Any]) -> dict[str, Any]:
         json_params = {}
         for key, value in request_params.items():
             json_input = convert_pydantic_to_json_value(value)
diff --git a/llama_stack/providers/remote/inference/runpod/config.py b/llama_stack/providers/remote/inference/runpod/config.py
index 377a7fe6a..e3913dc35 100644
--- a/llama_stack/providers/remote/inference/runpod/config.py
+++ b/llama_stack/providers/remote/inference/runpod/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -13,17 +13,17 @@ from llama_stack.schema_utils import json_schema_type
 
 @json_schema_type
 class RunpodImplConfig(BaseModel):
-    url: Optional[str] = Field(
+    url: str | None = Field(
         default=None,
         description="The URL for the Runpod model serving endpoint",
     )
-    api_token: Optional[str] = Field(
+    api_token: str | None = Field(
         default=None,
         description="The API token",
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
         return {
             "url": "${env.RUNPOD_URL:}",
             "api_token": "${env.RUNPOD_API_TOKEN:}",
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index 72cbead9b..f8c98893e 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -3,11 +3,12 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import AsyncGenerator
+from collections.abc import AsyncGenerator
 
 from openai import OpenAI
 
 from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.apis.inference.inference import OpenAIEmbeddingsResponse
 
 # from llama_stack.providers.datatypes import ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
@@ -134,3 +135,13 @@ class RunpodInferenceAdapter(
         task_type: Optional[EmbeddingTaskType] = None,
     ) -> EmbeddingsResponse:
         raise NotImplementedError()
+
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()  # stub: this adapter does not implement the OpenAI-style embeddings call
diff --git a/llama_stack/providers/remote/inference/sambanova/__init__.py b/llama_stack/providers/remote/inference/sambanova/__init__.py
index 3e682e69c..a3a7b8fbd 100644
--- a/llama_stack/providers/remote/inference/sambanova/__init__.py
+++ b/llama_stack/providers/remote/inference/sambanova/__init__.py
@@ -4,16 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from pydantic import BaseModel
+from llama_stack.apis.inference import Inference
 
 from .config import SambaNovaImplConfig
 
 
-class SambaNovaProviderDataValidator(BaseModel):
-    sambanova_api_key: str
-
-
-async def get_adapter_impl(config: SambaNovaImplConfig, _deps):
+async def get_adapter_impl(config: SambaNovaImplConfig, _deps) -> Inference:
     from .sambanova import SambaNovaInferenceAdapter
 
     assert isinstance(config, SambaNovaImplConfig), f"Unexpected config type: {type(config)}"
diff --git a/llama_stack/providers/remote/inference/sambanova/config.py b/llama_stack/providers/remote/inference/sambanova/config.py
index a30c29b74..abbf9430f 100644
--- a/llama_stack/providers/remote/inference/sambanova/config.py
+++ b/llama_stack/providers/remote/inference/sambanova/config.py
@@ -4,27 +4,34 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, SecretStr
 
 from llama_stack.schema_utils import json_schema_type
 
 
+class SambaNovaProviderDataValidator(BaseModel):
+    sambanova_api_key: str | None = Field(
+        default=None,
+        description="Sambanova Cloud API key",
+    )
+
+
 @json_schema_type
 class SambaNovaImplConfig(BaseModel):
     url: str = Field(
         default="https://api.sambanova.ai/v1",
         description="The URL for the SambaNova AI server",
     )
-    api_key: Optional[str] = Field(
+    api_key: SecretStr | None = Field(
         default=None,
-        description="The SambaNova.ai API Key",
+        description="The SambaNova cloud API Key",
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.sambanova.ai/v1",
-            "api_key": "${env.SAMBANOVA_API_KEY}",
+            "api_key": api_key,
         }
diff --git a/llama_stack/providers/remote/inference/sambanova/models.py b/llama_stack/providers/remote/inference/sambanova/models.py
index 43041e94a..9954fa7a0 100644
--- a/llama_stack/providers/remote/inference/sambanova/models.py
+++ b/llama_stack/providers/remote/inference/sambanova/models.py
@@ -11,43 +11,43 @@ from llama_stack.providers.utils.inference.model_registry import (
 
 MODEL_ENTRIES = [
     build_hf_repo_model_entry(
-        "Meta-Llama-3.1-8B-Instruct",
+        "sambanova/Meta-Llama-3.1-8B-Instruct",
         CoreModelId.llama3_1_8b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-3.1-70B-Instruct",
-        CoreModelId.llama3_1_70b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "Meta-Llama-3.1-405B-Instruct",
+        "sambanova/Meta-Llama-3.1-405B-Instruct",
         CoreModelId.llama3_1_405b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-3.2-1B-Instruct",
+        "sambanova/Meta-Llama-3.2-1B-Instruct",
         CoreModelId.llama3_2_1b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-3.2-3B-Instruct",
+        "sambanova/Meta-Llama-3.2-3B-Instruct",
         CoreModelId.llama3_2_3b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-3.3-70B-Instruct",
+        "sambanova/Meta-Llama-3.3-70B-Instruct",
         CoreModelId.llama3_3_70b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Llama-3.2-11B-Vision-Instruct",
+        "sambanova/Llama-3.2-11B-Vision-Instruct",
         CoreModelId.llama3_2_11b_vision_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Llama-3.2-90B-Vision-Instruct",
+        "sambanova/Llama-3.2-90B-Vision-Instruct",
         CoreModelId.llama3_2_90b_vision_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-Guard-3-8B",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-    build_hf_repo_model_entry(
-        "Llama-4-Scout-17B-16E-Instruct",
+        "sambanova/Llama-4-Scout-17B-16E-Instruct",
         CoreModelId.llama4_scout_17b_16e_instruct.value,
     ),
+    build_hf_repo_model_entry(
+        "sambanova/Llama-4-Maverick-17B-128E-Instruct",
+        CoreModelId.llama4_maverick_17b_128e_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "sambanova/Meta-Llama-Guard-3-8B",
+        CoreModelId.llama_guard_3_8b.value,
+    ),
 ]
diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py
index 1665e72b8..20f863665 100644
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -5,305 +5,249 @@
 # the root directory of this source tree.
 
 import json
-from typing import AsyncGenerator, List, Optional
+from collections.abc import Iterable
 
-from openai import OpenAI
+from openai.types.chat import (
+    ChatCompletionAssistantMessageParam as OpenAIChatCompletionAssistantMessage,
+)
+from openai.types.chat import (
+    ChatCompletionContentPartImageParam as OpenAIChatCompletionContentPartImageParam,
+)
+from openai.types.chat import (
+    ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam,
+)
+from openai.types.chat import (
+    ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
+)
+from openai.types.chat import (
+    ChatCompletionMessageParam as OpenAIChatCompletionMessage,
+)
+from openai.types.chat import (
+    ChatCompletionMessageToolCallParam as OpenAIChatCompletionMessageToolCall,
+)
+from openai.types.chat import (
+    ChatCompletionSystemMessageParam as OpenAIChatCompletionSystemMessage,
+)
+from openai.types.chat import (
+    ChatCompletionToolMessageParam as OpenAIChatCompletionToolMessage,
+)
+from openai.types.chat import (
+    ChatCompletionUserMessageParam as OpenAIChatCompletionUserMessage,
+)
+from openai.types.chat.chat_completion_content_part_image_param import (
+    ImageURL as OpenAIImageURL,
+)
+from openai.types.chat.chat_completion_message_tool_call_param import (
+    Function as OpenAIFunction,
+)
 
 from llama_stack.apis.common.content_types import (
     ImageContentItem,
     InterleavedContent,
-    InterleavedContentItem,
     TextContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
-    ChatCompletionResponse,
     CompletionMessage,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
-    GreedySamplingStrategy,
-    Inference,
-    LogProbConfig,
+    JsonSchemaResponseFormat,
     Message,
-    ResponseFormat,
-    SamplingParams,
-    StopReason,
     SystemMessage,
-    TextTruncation,
-    ToolCall,
     ToolChoice,
-    ToolConfig,
-    ToolDefinition,
-    ToolPromptFormat,
     ToolResponseMessage,
-    TopKSamplingStrategy,
-    TopPSamplingStrategy,
     UserMessage,
 )
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.log import get_logger
+from llama_stack.models.llama.datatypes import BuiltinTool
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_compat import (
-    OpenAIChatCompletionToLlamaStackMixin,
-    OpenAICompletionToLlamaStackMixin,
-    process_chat_completion_stream_response,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    convert_image_content_to_url,
+    convert_tooldef_to_openai_tool,
+    get_sampling_options,
 )
+from llama_stack.providers.utils.inference.prompt_adapter import convert_image_content_to_url
 
 from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES
 
+logger = get_logger(name=__name__, category="inference")
 
-class SambaNovaInferenceAdapter(
-    ModelRegistryHelper,
-    Inference,
-    OpenAIChatCompletionToLlamaStackMixin,
-    OpenAICompletionToLlamaStackMixin,
-):
-    def __init__(self, config: SambaNovaImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
-        self.config = config
 
-    async def initialize(self) -> None:
-        return
+async def convert_message_to_openai_dict_with_b64_images(
+    message: Message | dict,
+) -> OpenAIChatCompletionMessage:
+    """
+    Convert a Message to an OpenAI API-compatible dictionary.
+    """
+    # users can supply a dict instead of a Message object, we'll
+    # convert it to a Message object and proceed with some type safety.
+    if isinstance(message, dict):
+        if "role" not in message:
+            raise ValueError("role is required in message")
+        if message["role"] == "user":
+            message = UserMessage(**message)
+        elif message["role"] == "assistant":
+            message = CompletionMessage(**message)
+        elif message["role"] == "tool":
+            message = ToolResponseMessage(**message)
+        elif message["role"] == "system":
+            message = SystemMessage(**message)
+        else:
+            raise ValueError(f"Unsupported message role: {message['role']}")
 
-    async def shutdown(self) -> None:
-        pass
-
-    def _get_client(self) -> OpenAI:
-        return OpenAI(base_url=self.config.url, api_key=self.config.api_key)
-
-    async def completion(
-        self,
-        model_id: str,
+    # Map Llama Stack spec to OpenAI spec -
+    #  str -> str
+    #  {"type": "text", "text": ...} -> {"type": "text", "text": ...}
+    #  {"type": "image", "image": {"url": {"uri": ...}}} -> {"type": "image_url", "image_url": {"url": ...}}
+    #  {"type": "image", "image": {"data": ...}} -> {"type": "image_url", "image_url": {"url": "data:image/?;base64,..."}}
+    #  List[...] -> List[...]
+    async def _convert_message_content(
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        raise NotImplementedError()
-
-    async def chat_completion(
-        self,
-        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
-        stream: Optional[bool] = False,
-        tool_config: Optional[ToolConfig] = None,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> AsyncGenerator:
-        if sampling_params is None:
-            sampling_params = SamplingParams()
-        model = await self.model_store.get_model(model_id)
-
-        request = ChatCompletionRequest(
-            model=model.provider_resource_id,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            stream=stream,
-            logprobs=logprobs,
-            tool_config=tool_config,
-        )
-        request_sambanova = await self.convert_chat_completion_request(request)
-
-        if stream:
-            return self._stream_chat_completion(request_sambanova)
-        else:
-            return await self._nonstream_chat_completion(request_sambanova)
-
-    async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
-        response = self._get_client().chat.completions.create(**request)
-
-        choice = response.choices[0]
-
-        result = ChatCompletionResponse(
-            completion_message=CompletionMessage(
-                content=choice.message.content or "",
-                stop_reason=self.convert_to_sambanova_finish_reason(choice.finish_reason),
-                tool_calls=self.convert_to_sambanova_tool_calls(choice.message.tool_calls),
-            ),
-            logprobs=None,
-        )
-
-        return result
-
-    async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
-        async def _to_async_generator():
-            streaming = self._get_client().chat.completions.create(**request)
-            for chunk in streaming:
-                yield chunk
-
-        stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, request):
-            yield chunk
-
-    async def embeddings(
-        self,
-        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()
-
-    async def convert_chat_completion_request(self, request: ChatCompletionRequest) -> dict:
-        compatible_request = self.convert_sampling_params(request.sampling_params)
-        compatible_request["model"] = request.model
-        compatible_request["messages"] = await self.convert_to_sambanova_messages(request.messages)
-        compatible_request["stream"] = request.stream
-        compatible_request["logprobs"] = False
-        compatible_request["extra_headers"] = {
-            b"User-Agent": b"llama-stack: sambanova-inference-adapter",
-        }
-        compatible_request["tools"] = self.convert_to_sambanova_tool(request.tools)
-        return compatible_request
-
-    def convert_sampling_params(self, sampling_params: SamplingParams, legacy: bool = False) -> dict:
-        params = {}
-
-        if sampling_params:
-            params["frequency_penalty"] = sampling_params.repetition_penalty
-
-            if sampling_params.max_tokens:
-                if legacy:
-                    params["max_tokens"] = sampling_params.max_tokens
-                else:
-                    params["max_completion_tokens"] = sampling_params.max_tokens
-
-            if isinstance(sampling_params.strategy, TopPSamplingStrategy):
-                params["top_p"] = sampling_params.strategy.top_p
-            if isinstance(sampling_params.strategy, TopKSamplingStrategy):
-                params["extra_body"]["top_k"] = sampling_params.strategy.top_k
-            if isinstance(sampling_params.strategy, GreedySamplingStrategy):
-                params["temperature"] = 0.0
-
-        return params
-
-    async def convert_to_sambanova_messages(self, messages: List[Message]) -> List[dict]:
-        conversation = []
-        for message in messages:
-            content = {}
-
-            content["content"] = await self.convert_to_sambanova_content(message)
-
-            if isinstance(message, UserMessage):
-                content["role"] = "user"
-            elif isinstance(message, CompletionMessage):
-                content["role"] = "assistant"
-                tools = []
-                for tool_call in message.tool_calls:
-                    tools.append(
-                        {
-                            "id": tool_call.call_id,
-                            "function": {
-                                "name": tool_call.name,
-                                "arguments": json.dumps(tool_call.arguments),
-                            },
-                            "type": "function",
-                        }
-                    )
-                content["tool_calls"] = tools
-            elif isinstance(message, ToolResponseMessage):
-                content["role"] = "tool"
-                content["tool_call_id"] = message.call_id
-            elif isinstance(message, SystemMessage):
-                content["role"] = "system"
-
-            conversation.append(content)
-
-        return conversation
-
-    async def convert_to_sambanova_content(self, message: Message) -> dict:
-        async def _convert_content(content) -> dict:
-            if isinstance(content, ImageContentItem):
-                url = await convert_image_content_to_url(content, download=True)
-                # A fix to make sure the call sucess.
-                components = url.split(";base64")
-                url = f"{components[0].lower()};base64{components[1]}"
-                return {
-                    "type": "image_url",
-                    "image_url": {"url": url},
-                }
+    ) -> str | Iterable[OpenAIChatCompletionContentPartParam]:
+        async def impl(
+            content_: InterleavedContent,
+        ) -> str | OpenAIChatCompletionContentPartParam | list[OpenAIChatCompletionContentPartParam]:
+            # Llama Stack and OpenAI spec match for str and text input
+            if isinstance(content_, str):
+                return content_
+            elif isinstance(content_, TextContentItem):
+                return OpenAIChatCompletionContentPartTextParam(
+                    type="text",
+                    text=content_.text,
+                )
+            elif isinstance(content_, ImageContentItem):
+                return OpenAIChatCompletionContentPartImageParam(
+                    type="image_url",
+                    image_url=OpenAIImageURL(url=await convert_image_content_to_url(content_, download=True)),
+                )
+            elif isinstance(content_, list):
+                return [await impl(item) for item in content_]
             else:
-                text = content.text if isinstance(content, TextContentItem) else content
-                assert isinstance(text, str)
-                return {"type": "text", "text": text}
+                raise ValueError(f"Unsupported content type: {type(content_)}")
 
-        if isinstance(message.content, list):
-            # If it is a list, the text content should be wrapped in dict
-            content = [await _convert_content(c) for c in message.content]
+        ret = await impl(content)
+
+        # OpenAI*Message expects a str or list
+        if isinstance(ret, str) or isinstance(ret, list):
+            return ret
         else:
-            content = message.content
+            return [ret]
 
-        return content
+    out: OpenAIChatCompletionMessage = None
+    if isinstance(message, UserMessage):
+        out = OpenAIChatCompletionUserMessage(
+            role="user",
+            content=await _convert_message_content(message.content),
+        )
+    elif isinstance(message, CompletionMessage):
+        out = OpenAIChatCompletionAssistantMessage(
+            role="assistant",
+            content=await _convert_message_content(message.content),
+            tool_calls=[
+                OpenAIChatCompletionMessageToolCall(
+                    id=tool.call_id,
+                    function=OpenAIFunction(
+                        name=tool.tool_name if not isinstance(tool.tool_name, BuiltinTool) else tool.tool_name.value,
+                        arguments=json.dumps(tool.arguments),
+                    ),
+                    type="function",
+                )
+                for tool in message.tool_calls
+            ]
+            or None,
+        )
+    elif isinstance(message, ToolResponseMessage):
+        out = OpenAIChatCompletionToolMessage(
+            role="tool",
+            tool_call_id=message.call_id,
+            content=await _convert_message_content(message.content),
+        )
+    elif isinstance(message, SystemMessage):
+        out = OpenAIChatCompletionSystemMessage(
+            role="system",
+            content=await _convert_message_content(message.content),
+        )
+    else:
+        raise ValueError(f"Unsupported message type: {type(message)}")
 
-    def convert_to_sambanova_tool(self, tools: List[ToolDefinition]) -> List[dict]:
-        if tools is None:
-            return tools
+    return out
 
-        compatiable_tools = []
 
-        for tool in tools:
-            properties = {}
-            compatiable_required = []
-            if tool.parameters:
-                for tool_key, tool_param in tool.parameters.items():
-                    properties[tool_key] = {"type": tool_param.param_type}
-                    if tool_param.description:
-                        properties[tool_key]["description"] = tool_param.description
-                    if tool_param.default:
-                        properties[tool_key]["default"] = tool_param.default
-                    if tool_param.required:
-                        compatiable_required.append(tool_key)
+class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
+    _config: SambaNovaImplConfig
 
-            compatiable_tool = {
-                "type": "function",
-                "function": {
-                    "name": tool.tool_name,
-                    "description": tool.description,
-                    "parameters": {
-                        "type": "object",
-                        "properties": properties,
-                        "required": compatiable_required,
-                    },
+    def __init__(self, config: SambaNovaImplConfig):
+        self.config = config  # kept on the instance; _get_api_key and _get_params read api_key/url from it
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=MODEL_ENTRIES,
+            api_key_from_config=self.config.api_key,  # NOTE(review): SecretStr | None, not str; confirm mixin copes
+            provider_data_api_key_field="sambanova_api_key",
+        )
+
+    def _get_api_key(self) -> str:
+        """Return the API key from config, falling back to per-request provider data (ValueError if neither)."""
+        if self.config.api_key:
+            return self.config.api_key.get_secret_value()
+        # No usable key in the static config: the caller must supply one with the request.
+        provider_data = self.get_request_provider_data()
+        if provider_data is None or not provider_data.sambanova_api_key:
+            raise ValueError(
+                'Pass Sambanova API Key in the header X-LlamaStack-Provider-Data as { "sambanova_api_key": <your api key> }'
+            )
+        return provider_data.sambanova_api_key
+
+    async def _get_params(self, request: ChatCompletionRequest) -> dict:
+        """Build the parameter dict (model, credentials, messages, response format, tools, sampling) for `request`."""
+        messages = [await convert_message_to_openai_dict_with_b64_images(m) for m in request.messages]
+        input_dict = {"messages": messages}
+        if fmt := request.response_format:
+            if not isinstance(fmt, JsonSchemaResponseFormat):
+                raise ValueError(
+                    f"Unsupported response format: {type(fmt)}. Only JsonSchemaResponseFormat is supported."
+                )
+
+            # Deep-copy via JSON round trip: mutating the caller's schema would break reuse (KeyError on "title").
+            fmt = json.loads(json.dumps(fmt.json_schema))
+            name = fmt.pop("title")
+            fmt["additionalProperties"] = False
+
+            # Apply additionalProperties: False recursively to all objects
+            fmt = self._add_additional_properties_recursive(fmt)
+
+            input_dict["response_format"] = {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": name,
+                    "schema": fmt,
+                    "strict": False,
                 },
             }
+        if request.tools:
+            input_dict["tools"] = [convert_tooldef_to_openai_tool(tool) for tool in request.tools]
+            if request.tool_config.tool_choice:
+                input_dict["tool_choice"] = (
+                    request.tool_config.tool_choice.value
+                    if isinstance(request.tool_config.tool_choice, ToolChoice)
+                    else request.tool_config.tool_choice
+                )
 
-            compatiable_tools.append(compatiable_tool)
+        provider_data = self.get_request_provider_data()
+        key_field = self.provider_data_api_key_field
+        if provider_data and getattr(provider_data, key_field, None):  # a key sent with the request wins
+            api_key = getattr(provider_data, key_field)
+        else:
+            api_key = self._get_api_key()
 
-        if len(compatiable_tools) > 0:
-            return compatiable_tools
-        return None
-
-    def convert_to_sambanova_finish_reason(self, finish_reason: str) -> StopReason:
         return {
-            "stop": StopReason.end_of_turn,
-            "length": StopReason.out_of_tokens,
-            "tool_calls": StopReason.end_of_message,
-        }.get(finish_reason, StopReason.end_of_turn)
+            "model": request.model,
+            "api_key": api_key,
+            "api_base": self.config.url,
+            **input_dict,
+            "stream": request.stream,
+            **get_sampling_options(request.sampling_params),
+        }
 
-    def convert_to_sambanova_tool_calls(
-        self,
-        tool_calls,
-    ) -> List[ToolCall]:
-        if not tool_calls:
-            return []
+    async def initialize(self):
+        await super().initialize()  # no SambaNova-specific setup; delegate to the base class
 
-        compitable_tool_calls = [
-            ToolCall(
-                call_id=call.id,
-                tool_name=call.function.name,
-                arguments=json.loads(call.function.arguments),
-                arguments_json=call.function.arguments,
-            )
-            for call in tool_calls
-        ]
-
-        return compitable_tool_calls
+    async def shutdown(self):
+        await super().shutdown()  # no SambaNova-specific teardown; delegate to the base class
diff --git a/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py b/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py
index e31a3364c..60afe91ca 100644
--- a/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import SambaNovaCompatConfig
 
 
-async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .sambanova import SambaNovaCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/sambanova_openai_compat/config.py b/llama_stack/providers/remote/inference/sambanova_openai_compat/config.py
index b792cb6e7..072fa85d1 100644
--- a/llama_stack/providers/remote/inference/sambanova_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/sambanova_openai_compat/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class SambaNovaProviderDataValidator(BaseModel):
-    sambanova_api_key: Optional[str] = Field(
+    sambanova_api_key: str | None = Field(
         default=None,
         description="API key for SambaNova models",
     )
@@ -20,7 +20,7 @@ class SambaNovaProviderDataValidator(BaseModel):
 
 @json_schema_type
 class SambaNovaCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="The SambaNova API key",
     )
@@ -31,7 +31,7 @@ class SambaNovaCompatConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "openai_compat_api_base": "https://api.sambanova.ai/v1",
             "api_key": api_key,
diff --git a/llama_stack/providers/remote/inference/tgi/__init__.py b/llama_stack/providers/remote/inference/tgi/__init__.py
index 834e51324..51614f1a6 100644
--- a/llama_stack/providers/remote/inference/tgi/__init__.py
+++ b/llama_stack/providers/remote/inference/tgi/__init__.py
@@ -4,13 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Union
-
 from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig
 
 
 async def get_adapter_impl(
-    config: Union[InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig],
+    config: InferenceAPIImplConfig | InferenceEndpointImplConfig | TGIImplConfig,
     _deps,
 ):
     from .tgi import InferenceAPIAdapter, InferenceEndpointAdapter, TGIAdapter
diff --git a/llama_stack/providers/remote/inference/tgi/config.py b/llama_stack/providers/remote/inference/tgi/config.py
index 6ad663662..3d632c9d8 100644
--- a/llama_stack/providers/remote/inference/tgi/config.py
+++ b/llama_stack/providers/remote/inference/tgi/config.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Optional
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -29,7 +28,7 @@ class InferenceEndpointImplConfig(BaseModel):
     endpoint_name: str = Field(
         description="The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided.",
     )
-    api_token: Optional[SecretStr] = Field(
+    api_token: SecretStr | None = Field(
         default=None,
         description="Your Hugging Face user access token (will default to locally saved token if not provided)",
     )
@@ -52,7 +51,7 @@ class InferenceAPIImplConfig(BaseModel):
     huggingface_repo: str = Field(
         description="The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct')",
     )
-    api_token: Optional[SecretStr] = Field(
+    api_token: SecretStr | None = Field(
         default=None,
         description="Your Hugging Face user access token (will default to locally saved token if not provided)",
     )
diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py
index 4ee386a15..292d74ef8 100644
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -6,7 +6,7 @@
 
 
 import logging
-from typing import AsyncGenerator, List, Optional
+from collections.abc import AsyncGenerator
 
 from huggingface_hub import AsyncInferenceClient, HfApi
 
@@ -23,6 +23,7 @@ from llama_stack.apis.inference import (
     Inference,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     ResponseFormatType,
     SamplingParams,
@@ -105,10 +106,10 @@ class _HfAdapter(
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -134,7 +135,7 @@ class _HfAdapter(
 
     def _build_options(
         self,
-        sampling_params: Optional[SamplingParams] = None,
+        sampling_params: SamplingParams | None = None,
         fmt: ResponseFormat = None,
     ):
         options = get_sampling_options(sampling_params)
@@ -209,15 +210,15 @@ class _HfAdapter(
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -284,13 +285,23 @@ class _HfAdapter(
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         raise NotImplementedError()
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
+
 
 class TGIAdapter(_HfAdapter):
     async def initialize(self, config: TGIImplConfig) -> None:
diff --git a/llama_stack/providers/remote/inference/together/config.py b/llama_stack/providers/remote/inference/together/config.py
index fa7c45c9f..5c7f60519 100644
--- a/llama_stack/providers/remote/inference/together/config.py
+++ b/llama_stack/providers/remote/inference/together/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -17,13 +17,13 @@ class TogetherImplConfig(BaseModel):
         default="https://api.together.xyz/v1",
         description="The URL for the Together AI server",
     )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
         default=None,
         description="The Together AI API Key",
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.together.xyz/v1",
             "api_key": "${env.TOGETHER_API_KEY:}",
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index 48e41f5b0..7305a638d 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any
 
 from openai import AsyncOpenAI
 from together import AsyncTogether
@@ -22,6 +23,7 @@ from llama_stack.apis.inference import (
     Inference,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     ResponseFormatType,
     SamplingParams,
@@ -86,10 +88,10 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -147,8 +149,8 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
 
     def _build_options(
         self,
-        sampling_params: Optional[SamplingParams],
-        logprobs: Optional[LogProbConfig],
+        sampling_params: SamplingParams | None,
+        logprobs: LogProbConfig | None,
         fmt: ResponseFormat,
     ) -> dict:
         options = get_sampling_options(sampling_params)
@@ -175,15 +177,15 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -224,7 +226,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         async for chunk in process_chat_completion_stream_response(stream, request):
             yield chunk
 
-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
         input_dict = {}
         media_present = request_has_media(request)
         llama_model = self.get_llama_model(request.model)
@@ -249,10 +251,10 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         model = await self.model_store.get_model(model_id)
         assert all(not content_has_media(content) for content in contents), (
@@ -266,27 +268,37 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         embeddings = [item.embedding for item in r.data]
         return EmbeddingsResponse(embeddings=embeddings)
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError()
+
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
@@ -313,29 +325,29 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
             model=model_obj.provider_resource_id,
diff --git a/llama_stack/providers/remote/inference/together_openai_compat/__init__.py b/llama_stack/providers/remote/inference/together_openai_compat/__init__.py
index 6fdf05b7e..8213fc5f4 100644
--- a/llama_stack/providers/remote/inference/together_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/together_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import TogetherCompatConfig
 
 
-async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .together import TogetherCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/together_openai_compat/config.py b/llama_stack/providers/remote/inference/together_openai_compat/config.py
index 120adbed9..0c6d4f748 100644
--- a/llama_stack/providers/remote/inference/together_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/together_openai_compat/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type
 
 
 class TogetherProviderDataValidator(BaseModel):
-    together_api_key: Optional[str] = Field(
+    together_api_key: str | None = Field(
         default=None,
         description="API key for Together models",
     )
@@ -20,7 +20,7 @@ class TogetherProviderDataValidator(BaseModel):
 
 @json_schema_type
 class TogetherCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="The Together API key",
     )
@@ -31,7 +31,7 @@ class TogetherCompatConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.TOGETHER_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.TOGETHER_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "openai_compat_api_base": "https://api.together.xyz/v1",
             "api_key": api_key,
diff --git a/llama_stack/providers/remote/inference/vllm/config.py b/llama_stack/providers/remote/inference/vllm/config.py
index 762cffde3..99abddf51 100644
--- a/llama_stack/providers/remote/inference/vllm/config.py
+++ b/llama_stack/providers/remote/inference/vllm/config.py
@@ -4,16 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Optional
+from pathlib import Path
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
 from llama_stack.schema_utils import json_schema_type
 
 
 @json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
-    url: Optional[str] = Field(
+    url: str | None = Field(
         default=None,
         description="The URL for the vLLM model serving endpoint",
     )
@@ -21,15 +21,31 @@ class VLLMInferenceAdapterConfig(BaseModel):
         default=4096,
         description="Maximum number of tokens to generate.",
     )
-    api_token: Optional[str] = Field(
+    api_token: str | None = Field(
         default="fake",
         description="The API token",
     )
-    tls_verify: bool = Field(
+    tls_verify: bool | str = Field(
         default=True,
-        description="Whether to verify TLS certificates",
+        description="Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file.",
     )
 
+    @field_validator("tls_verify")
+    @classmethod
+    def validate_tls_verify(cls, v):
+        """Coerce "true"/"false" strings to bool; check a CA cert path exists and return it as an absolute path."""
+        if not isinstance(v, str):
+            return v
+        if v.lower() in ("true", "false"):
+            return v.lower() == "true"
+        # Otherwise it is a cert path; it is later passed to httpx as-is, so return the expanded absolute form.
+        cert_path = Path(v).expanduser().resolve()
+        if not cert_path.exists():
+            raise ValueError(f"TLS certificate file does not exist: {v}")
+        if not cert_path.is_file():
+            raise ValueError(f"TLS certificate path is not a file: {v}")
+        return str(cert_path)
+
     @classmethod
     def sample_run_config(
         cls,
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 8cfef2ee0..9f38d9abf 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -5,7 +5,8 @@
 # the root directory of this source tree.
 import json
 import logging
-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any
 
 import httpx
 from openai import AsyncOpenAI
@@ -37,6 +38,7 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -94,7 +96,7 @@ def build_hf_repo_model_entries():
 
 def _convert_to_vllm_tool_calls_in_response(
     tool_calls,
-) -> List[ToolCall]:
+) -> list[ToolCall]:
     if not tool_calls:
         return []
 
@@ -109,7 +111,7 @@ def _convert_to_vllm_tool_calls_in_response(
     ]
 
 
-def _convert_to_vllm_tools_in_request(tools: List[ToolDefinition]) -> List[dict]:
+def _convert_to_vllm_tools_in_request(tools: list[ToolDefinition]) -> list[dict]:
     compat_tools = []
 
     for tool in tools:
@@ -157,27 +159,28 @@ def _convert_to_vllm_finish_reason(finish_reason: str) -> StopReason:
     }.get(finish_reason, StopReason.end_of_turn)
 
 
-async def _process_vllm_chat_completion_stream_response(
-    stream: AsyncGenerator[OpenAIChatCompletionChunk, None],
-) -> AsyncGenerator:
-    event_type = ChatCompletionResponseEventType.start
-    tool_call_buf = UnparseableToolCall()
-    async for chunk in stream:
-        if not chunk.choices:
-            log.warning("vLLM failed to generation any completions - check the vLLM server logs for an error.")
-            continue
-        choice = chunk.choices[0]
-        if choice.finish_reason:
-            args_str = tool_call_buf.arguments
-            args = None
-            try:
-                args = {} if not args_str else json.loads(args_str)
-            except Exception as e:
-                log.warning(f"Failed to parse tool call buffer arguments: {args_str} \nError: {e}")
-            if args:
-                yield ChatCompletionResponseStreamChunk(
+def _process_vllm_chat_completion_end_of_stream(
+    finish_reason: str | None,
+    last_chunk_content: str | None,
+    current_event_type: ChatCompletionResponseEventType,
+    tool_call_bufs: dict[str, UnparseableToolCall] | None = None,
+) -> list[OpenAIChatCompletionChunk]:
+    chunks = []
+
+    if finish_reason is not None:
+        stop_reason = _convert_to_vllm_finish_reason(finish_reason)
+    else:
+        stop_reason = StopReason.end_of_message
+
+    tool_call_bufs = tool_call_bufs or {}
+    for _index, tool_call_buf in sorted(tool_call_bufs.items()):
+        args_str = tool_call_buf.arguments or "{}"
+        try:
+            args = json.loads(args_str)
+            chunks.append(
+                ChatCompletionResponseStreamChunk(
                     event=ChatCompletionResponseEvent(
-                        event_type=event_type,
+                        event_type=current_event_type,
                         delta=ToolCallDelta(
                             tool_call=ToolCall(
                                 call_id=tool_call_buf.call_id,
@@ -189,8 +192,12 @@ async def _process_vllm_chat_completion_stream_response(
                         ),
                     )
                 )
-            elif args_str:
-                yield ChatCompletionResponseStreamChunk(
+            )
+        except Exception as e:
+            log.warning(f"Failed to parse tool call buffer arguments: {args_str} \nError: {e}")
+
+            chunks.append(
+                ChatCompletionResponseStreamChunk(
                     event=ChatCompletionResponseEvent(
                         event_type=ChatCompletionResponseEventType.progress,
                         delta=ToolCallDelta(
@@ -199,21 +206,62 @@ async def _process_vllm_chat_completion_stream_response(
                         ),
                     )
                 )
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.complete,
-                    delta=TextDelta(text=choice.delta.content or ""),
-                    logprobs=None,
-                    stop_reason=_convert_to_vllm_finish_reason(choice.finish_reason),
-                )
             )
-        elif choice.delta.tool_calls:
-            tool_call = convert_tool_call(choice.delta.tool_calls[0])
-            tool_call_buf.tool_name += str(tool_call.tool_name)
-            tool_call_buf.call_id += tool_call.call_id
-            # TODO: remove str() when dict type for 'arguments' is no longer allowed
-            tool_call_buf.arguments += str(tool_call.arguments)
-        else:
+
+    chunks.append(
+        ChatCompletionResponseStreamChunk(
+            event=ChatCompletionResponseEvent(
+                event_type=ChatCompletionResponseEventType.complete,
+                delta=TextDelta(text=last_chunk_content or ""),
+                logprobs=None,
+                stop_reason=stop_reason,
+            )
+        )
+    )
+
+    return chunks
+
+
+async def _process_vllm_chat_completion_stream_response(
+    stream: AsyncGenerator[OpenAIChatCompletionChunk, None],
+) -> AsyncGenerator:
+    yield ChatCompletionResponseStreamChunk(
+        event=ChatCompletionResponseEvent(
+            event_type=ChatCompletionResponseEventType.start,
+            delta=TextDelta(text=""),
+        )
+    )
+    event_type = ChatCompletionResponseEventType.progress
+    tool_call_bufs: dict[str, UnparseableToolCall] = {}
+    end_of_stream_processed = False
+
+    async for chunk in stream:
+        if not chunk.choices:
+            log.warning("vLLM failed to generation any completions - check the vLLM server logs for an error.")
+            return
+        choice = chunk.choices[0]
+        if choice.delta.tool_calls:
+            for delta_tool_call in choice.delta.tool_calls:
+                tool_call = convert_tool_call(delta_tool_call)
+                if delta_tool_call.index not in tool_call_bufs:
+                    tool_call_bufs[delta_tool_call.index] = UnparseableToolCall()
+                tool_call_buf = tool_call_bufs[delta_tool_call.index]
+                tool_call_buf.tool_name += str(tool_call.tool_name)
+                tool_call_buf.call_id += tool_call.call_id
+                tool_call_buf.arguments += (
+                    tool_call.arguments if isinstance(tool_call.arguments, str) else json.dumps(tool_call.arguments)
+                )
+        if choice.finish_reason:
+            chunks = _process_vllm_chat_completion_end_of_stream(
+                finish_reason=choice.finish_reason,
+                last_chunk_content=choice.delta.content,
+                current_event_type=event_type,
+                tool_call_bufs=tool_call_bufs,
+            )
+            for c in chunks:
+                yield c
+            end_of_stream_processed = True
+        elif not choice.delta.tool_calls:
             yield ChatCompletionResponseStreamChunk(
                 event=ChatCompletionResponseEvent(
                     event_type=event_type,
@@ -223,6 +271,17 @@ async def _process_vllm_chat_completion_stream_response(
             )
             event_type = ChatCompletionResponseEventType.progress
 
+    if end_of_stream_processed:
+        return
+
+    # the stream ended without a chunk containing finish_reason - we have to generate the
+    # respective completion chunks manually
+    chunks = _process_vllm_chat_completion_end_of_stream(
+        finish_reason=None, last_chunk_content=None, current_event_type=event_type, tool_call_bufs=tool_call_bufs
+    )
+    for c in chunks:
+        yield c
+
 
 class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
@@ -255,22 +314,24 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         return AsyncOpenAI(
             base_url=self.config.url,
             api_key=self.config.api_token,
-            http_client=None if self.config.tls_verify else httpx.AsyncClient(verify=False),
+            http_client=httpx.AsyncClient(verify=self.config.tls_verify),
         )
 
     async def completion(
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
         self._lazy_initialize_client()
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self._get_model(model_id)
+        if model.provider_resource_id is None:
+            raise ValueError(f"Model {model_id} has no provider_resource_id set")
         request = CompletionRequest(
             model=model.provider_resource_id,
             content=content,
@@ -287,20 +348,22 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
         self._lazy_initialize_client()
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self._get_model(model_id)
+        if model.provider_resource_id is None:
+            raise ValueError(f"Model {model_id} has no provider_resource_id set")
         # This is to be consistent with OpenAI API and support vLLM <= v0.6.3
         # References:
         #   * https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
@@ -372,7 +435,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         # self.client should only be created after the initialization is complete to avoid asyncio cross-context errors.
         # Changing this may lead to unpredictable behavior.
         client = self._create_client() if self.client is None else self.client
-        model = await self.register_helper.register_model(model)
+        try:
+            model = await self.register_helper.register_model(model)
+        except ValueError:
+            pass  # Ignore statically unknown model, will check live listing
         res = await client.models.list()
         available_models = [m.id async for m in res]
         if model.provider_resource_id not in available_models:
@@ -382,7 +448,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             )
         return model
 
-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
         options = get_sampling_options(request.sampling_params)
         if "max_tokens" not in options:
             options["max_tokens"] = self.config.max_tokens
@@ -419,10 +485,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         self._lazy_initialize_client()
         assert self.client is not None
@@ -442,32 +508,42 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         embeddings = [data.embedding for data in response.data]
         return EmbeddingsResponse(embeddings=embeddings)
 
+    async def openai_embeddings(  # stub: always raises NotImplementedError
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError("openai_embeddings is not implemented by the vLLM inference adapter")
+
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         self._lazy_initialize_client()
         model_obj = await self._get_model(model)
 
-        extra_body: Dict[str, Any] = {}
+        extra_body: dict[str, Any] = {}
         if prompt_logprobs is not None and prompt_logprobs >= 0:
             extra_body["prompt_logprobs"] = prompt_logprobs
         if guided_choice:
@@ -498,29 +574,29 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         self._lazy_initialize_client()
         model_obj = await self._get_model(model)
         params = await prepare_openai_completion_params(
@@ -553,21 +629,21 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     async def batch_completion(
         self,
         model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch completion is not supported for Ollama")
 
     async def batch_chat_completion(
         self,
         model_id: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_config: Optional[ToolConfig] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_config: ToolConfig | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch chat completion is not supported for Ollama")
diff --git a/llama_stack/providers/remote/inference/watsonx/config.py b/llama_stack/providers/remote/inference/watsonx/config.py
index 7ee99b7e0..5eda9c5c0 100644
--- a/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/llama_stack/providers/remote/inference/watsonx/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import os
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field, SecretStr
 
@@ -24,11 +24,11 @@ class WatsonXConfig(BaseModel):
         default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
         description="A base url for accessing the watsonx.ai",
     )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
         default_factory=lambda: os.getenv("WATSONX_API_KEY"),
         description="The watsonx API key, only needed of using the hosted service",
     )
-    project_id: Optional[str] = Field(
+    project_id: str | None = Field(
         default_factory=lambda: os.getenv("WATSONX_PROJECT_ID"),
         description="The Project ID key, only needed of using the hosted service",
     )
@@ -38,7 +38,7 @@ class WatsonXConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "url": "${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}",
             "api_key": "${env.WATSONX_API_KEY:}",
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
index d5d87ec01..59f5f5562 100644
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -4,10 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import AsyncGenerator, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any
 
 from ibm_watson_machine_learning.foundation_models import Model
 from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
+from openai import AsyncOpenAI
 
 from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
 from llama_stack.apis.inference import (
@@ -19,6 +21,7 @@ from llama_stack.apis.inference import (
     Inference,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -27,10 +30,21 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import (
+    GreedySamplingStrategy,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+    TopKSamplingStrategy,
+    TopPSamplingStrategy,
+)
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
+    prepare_openai_completion_params,
     process_chat_completion_response,
     process_chat_completion_stream_response,
     process_completion_response,
@@ -66,10 +80,10 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -95,6 +109,14 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
 
         return Model(model_id=model_id, credentials=credentials, project_id=project_id)
 
+    def _get_openai_client(self) -> AsyncOpenAI:
+        """Lazily create the AsyncOpenAI client; unwrap a SecretStr key, since str() of it is masked."""
+        if not self._openai_client:
+            key = self._config.api_key
+            key = key.get_secret_value() if hasattr(key, "get_secret_value") else key
+            self._openai_client = AsyncOpenAI(base_url=f"{self._config.url}/openai/v1", api_key=key)
+        return self._openai_client
+
     async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse:
         params = await self._get_params(request)
         r = self._get_client(request.model).generate(**params)
@@ -132,15 +154,15 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
     ) -> AsyncGenerator:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -197,7 +219,7 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
         async for chunk in process_chat_completion_stream_response(stream, request):
             yield chunk
 
-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
         input_dict = {"params": {}}
         media_present = request_has_media(request)
         llama_model = self.get_llama_model(request.model)
@@ -213,36 +235,16 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
                 input_dict["params"][GenParams.MAX_NEW_TOKENS] = request.sampling_params.max_tokens
             if request.sampling_params.repetition_penalty:
                 input_dict["params"][GenParams.REPETITION_PENALTY] = request.sampling_params.repetition_penalty
-            if request.sampling_params.additional_params.get("top_p"):
-                input_dict["params"][GenParams.TOP_P] = request.sampling_params.additional_params["top_p"]
-            if request.sampling_params.additional_params.get("top_k"):
-                input_dict["params"][GenParams.TOP_K] = request.sampling_params.additional_params["top_k"]
-            if request.sampling_params.additional_params.get("temperature"):
-                input_dict["params"][GenParams.TEMPERATURE] = request.sampling_params.additional_params["temperature"]
-            if request.sampling_params.additional_params.get("length_penalty"):
-                input_dict["params"][GenParams.LENGTH_PENALTY] = request.sampling_params.additional_params[
-                    "length_penalty"
-                ]
-            if request.sampling_params.additional_params.get("random_seed"):
-                input_dict["params"][GenParams.RANDOM_SEED] = request.sampling_params.additional_params["random_seed"]
-            if request.sampling_params.additional_params.get("min_new_tokens"):
-                input_dict["params"][GenParams.MIN_NEW_TOKENS] = request.sampling_params.additional_params[
-                    "min_new_tokens"
-                ]
-            if request.sampling_params.additional_params.get("stop_sequences"):
-                input_dict["params"][GenParams.STOP_SEQUENCES] = request.sampling_params.additional_params[
-                    "stop_sequences"
-                ]
-            if request.sampling_params.additional_params.get("time_limit"):
-                input_dict["params"][GenParams.TIME_LIMIT] = request.sampling_params.additional_params["time_limit"]
-            if request.sampling_params.additional_params.get("truncate_input_tokens"):
-                input_dict["params"][GenParams.TRUNCATE_INPUT_TOKENS] = request.sampling_params.additional_params[
-                    "truncate_input_tokens"
-                ]
-            if request.sampling_params.additional_params.get("return_options"):
-                input_dict["params"][GenParams.RETURN_OPTIONS] = request.sampling_params.additional_params[
-                    "return_options"
-                ]
+
+            if isinstance(request.sampling_params.strategy, TopPSamplingStrategy):
+                input_dict["params"][GenParams.TOP_P] = request.sampling_params.strategy.top_p
+                input_dict["params"][GenParams.TEMPERATURE] = request.sampling_params.strategy.temperature
+            if isinstance(request.sampling_params.strategy, TopKSamplingStrategy):
+                input_dict["params"][GenParams.TOP_K] = request.sampling_params.strategy.top_k
+            if isinstance(request.sampling_params.strategy, GreedySamplingStrategy):
+                input_dict["params"][GenParams.TEMPERATURE] = 0.0
+
+        input_dict["params"][GenParams.STOP_SEQUENCES] = ["<|endoftext|>"]
 
         params = {
             **input_dict,
@@ -252,9 +254,137 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
-        pass
+        raise NotImplementedError("embedding is not supported for watsonx")
+
+    async def openai_embeddings(  # stub: always raises NotImplementedError
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        raise NotImplementedError("openai_embeddings is not implemented for watsonx")
+
+    async def openai_completion(  # OpenAI-style text completion served via the AsyncOpenAI client
+        self,
+        model: str,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,  # not forwarded to prepare_openai_completion_params below
+        prompt_logprobs: int | None = None,  # not forwarded to prepare_openai_completion_params below
+    ) -> OpenAICompletion:
+        model_obj = await self.model_store.get_model(model)  # look up the stored model record
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._get_openai_client().completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(  # OpenAI-style chat completion served via the AsyncOpenAI client
+        self,
+        model: str,
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        model_obj = await self.model_store.get_model(model)  # look up the stored model record
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        if params.get("stream", False):  # streaming: hand back the async generator without awaiting it
+            return self._stream_openai_chat_completion(params)
+        return await self._get_openai_client().chat.completions.create(**params)  # type: ignore
+
+    async def _stream_openai_chat_completion(self, params: dict) -> AsyncGenerator:
+        """Stream chat-completion chunks, dropping an unrequested trailing usage chunk.
+
+        ``params`` is forwarded unchanged to ``chat.completions.create``.
+        """
+        # watsonx.ai sometimes adds usage data to the stream
+        stream_options = params.get("stream_options", None)
+        include_usage = stream_options.get("include_usage", False) if stream_options else False
+        stream = await self._get_openai_client().chat.completions.create(**params)
+
+        finished = False
+        async for chunk in stream:
+            # Final usage chunk with no choices that the user didn't request, so discard
+            if finished and not include_usage and len(chunk.choices) == 0:
+                break
+            yield chunk
+            finished = finished or any(choice.finish_reason for choice in chunk.choices)
diff --git a/llama_stack/providers/remote/post_training/nvidia/config.py b/llama_stack/providers/remote/post_training/nvidia/config.py
index 7b42c8bb0..fa08b6e3f 100644
--- a/llama_stack/providers/remote/post_training/nvidia/config.py
+++ b/llama_stack/providers/remote/post_training/nvidia/config.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import os
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -15,23 +15,23 @@ from pydantic import BaseModel, Field
 class NvidiaPostTrainingConfig(BaseModel):
     """Configuration for NVIDIA Post Training implementation."""
 
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
         description="The NVIDIA API key.",
     )
 
-    dataset_namespace: Optional[str] = Field(
+    dataset_namespace: str | None = Field(
         default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
         description="The NVIDIA dataset namespace.",
     )
 
-    project_id: Optional[str] = Field(
+    project_id: str | None = Field(
         default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-example-model@v1"),
         description="The NVIDIA project ID.",
     )
 
     # ToDO: validate this, add default value
-    customizer_url: Optional[str] = Field(
+    customizer_url: str | None = Field(
         default_factory=lambda: os.getenv("NVIDIA_CUSTOMIZER_URL"),
         description="Base URL for the NeMo Customizer API",
     )
@@ -53,7 +53,7 @@ class NvidiaPostTrainingConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "api_key": "${env.NVIDIA_API_KEY:}",
             "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}",
@@ -71,27 +71,27 @@ class SFTLoRADefaultConfig(BaseModel):
     n_epochs: int = 50
 
     # NeMo customizer specific parameters
-    log_every_n_steps: Optional[int] = None
+    log_every_n_steps: int | None = None
     val_check_interval: float = 0.25
     sequence_packing_enabled: bool = False
     weight_decay: float = 0.01
     lr: float = 0.0001
 
     # SFT specific parameters
-    hidden_dropout: Optional[float] = None
-    attention_dropout: Optional[float] = None
-    ffn_dropout: Optional[float] = None
+    hidden_dropout: float | None = None
+    attention_dropout: float | None = None
+    ffn_dropout: float | None = None
 
     # LoRA default parameters
     lora_adapter_dim: int = 8
-    lora_adapter_dropout: Optional[float] = None
+    lora_adapter_dropout: float | None = None
     lora_alpha: int = 16
 
     # Data config
     batch_size: int = 8
 
     @classmethod
-    def sample_config(cls) -> Dict[str, Any]:
+    def sample_config(cls) -> dict[str, Any]:
         """Return a sample configuration for NVIDIA training."""
         return {
             "n_epochs": 50,
diff --git a/llama_stack/providers/remote/post_training/nvidia/models.py b/llama_stack/providers/remote/post_training/nvidia/models.py
index 1b31b4dbe..6a28f8af8 100644
--- a/llama_stack/providers/remote/post_training/nvidia/models.py
+++ b/llama_stack/providers/remote/post_training/nvidia/models.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
 
 from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
@@ -24,5 +23,5 @@ _MODEL_ENTRIES = [
 ]
 
 
-def get_model_entries() -> List[ProviderModelEntry]:
+def get_model_entries() -> list[ProviderModelEntry]:
     return _MODEL_ENTRIES
diff --git a/llama_stack/providers/remote/post_training/nvidia/post_training.py b/llama_stack/providers/remote/post_training/nvidia/post_training.py
index c74fb2a24..d839ffd6f 100644
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import warnings
 from datetime import datetime
-from typing import Any, Dict, List, Literal, Optional
+from typing import Any, Literal
 
 import aiohttp
 from pydantic import BaseModel, ConfigDict
@@ -50,7 +50,7 @@ class NvidiaPostTrainingJob(PostTrainingJob):
 
 
 class ListNvidiaPostTrainingJobs(BaseModel):
-    data: List[NvidiaPostTrainingJob]
+    data: list[NvidiaPostTrainingJob]
 
 
 class NvidiaPostTrainingJobStatusResponse(PostTrainingJobStatusResponse):
@@ -83,11 +83,11 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
         self,
         method: str,
         path: str,
-        headers: Optional[Dict[str, Any]] = None,
-        params: Optional[Dict[str, Any]] = None,
-        json: Optional[Dict[str, Any]] = None,
+        headers: dict[str, Any] | None = None,
+        params: dict[str, Any] | None = None,
+        json: dict[str, Any] | None = None,
         **kwargs,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         """Helper method to make HTTP requests to the Customizer API."""
         url = f"{self.customizer_url}{path}"
         request_headers = self.headers.copy()
@@ -109,9 +109,9 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
 
     async def get_training_jobs(
         self,
-        page: Optional[int] = 1,
-        page_size: Optional[int] = 10,
-        sort: Optional[Literal["created_at", "-created_at"]] = "created_at",
+        page: int | None = 1,
+        page_size: int | None = 10,
+        sort: Literal["created_at", "-created_at"] | None = "created_at",
     ) -> ListNvidiaPostTrainingJobs:
         """Get all customization jobs.
         Updated the base class return type from ListPostTrainingJobsResponse to ListNvidiaPostTrainingJobs.
@@ -207,12 +207,12 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
     async def supervised_fine_tune(
         self,
         job_uuid: str,
-        training_config: Dict[str, Any],
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
+        training_config: dict[str, Any],
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
         model: str,
-        checkpoint_dir: Optional[str],
-        algorithm_config: Optional[AlgorithmConfig] = None,
+        checkpoint_dir: str | None,
+        algorithm_config: AlgorithmConfig | None = None,
     ) -> NvidiaPostTrainingJob:
         """
         Fine-tunes a model on a dataset.
@@ -224,7 +224,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
 
         Parameters:
             training_config: TrainingConfig - Configuration for training
-            model: str - Model identifier
+            model: str - NeMo Customizer configuration name
             algorithm_config: Optional[AlgorithmConfig] - Algorithm-specific configuration
             checkpoint_dir: Optional[str] - Directory containing model checkpoints, ignored atm
             job_uuid: str - Unique identifier for the job, ignored atm
@@ -299,9 +299,6 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
 
             User is informed about unsupported parameters via warnings.
         """
-        # Map model to nvidia model name
-        # See `_MODEL_ENTRIES` for supported models
-        nvidia_model = self.get_provider_model_id(model)
 
         # Check for unsupported method parameters
         unsupported_method_params = []
@@ -347,7 +344,7 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
 
         # Prepare base job configuration
         job_config = {
-            "config": nvidia_model,
+            "config": model,
             "dataset": {
                 "name": training_config["data_config"]["dataset_id"],
                 "namespace": self.config.dataset_namespace,
@@ -423,8 +420,8 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
         finetuned_model: str,
         algorithm_config: DPOAlignmentConfig,
         training_config: TrainingConfig,
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
     ) -> PostTrainingJob:
         """Optimize a model based on preference data."""
         raise NotImplementedError("Preference optimization is not implemented yet")
diff --git a/llama_stack/providers/remote/post_training/nvidia/utils.py b/llama_stack/providers/remote/post_training/nvidia/utils.py
index ac47966af..d6e1016b2 100644
--- a/llama_stack/providers/remote/post_training/nvidia/utils.py
+++ b/llama_stack/providers/remote/post_training/nvidia/utils.py
@@ -6,7 +6,7 @@
 
 import logging
 import warnings
-from typing import Any, Dict, Set, Tuple
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -18,7 +18,7 @@ from .config import NvidiaPostTrainingConfig
 logger = logging.getLogger(__name__)
 
 
-def warn_unsupported_params(config_dict: Any, supported_keys: Set[str], config_name: str) -> None:
+def warn_unsupported_params(config_dict: Any, supported_keys: set[str], config_name: str) -> None:
     keys = set(config_dict.__annotations__.keys()) if isinstance(config_dict, BaseModel) else config_dict.keys()
     unsupported_params = [k for k in keys if k not in supported_keys]
     if unsupported_params:
@@ -28,7 +28,7 @@ def warn_unsupported_params(config_dict: Any, supported_keys: Set[str], config_n
 
 
 def validate_training_params(
-    training_config: Dict[str, Any], supported_keys: Set[str], config_name: str = "TrainingConfig"
+    training_config: dict[str, Any], supported_keys: set[str], config_name: str = "TrainingConfig"
 ) -> None:
     """
     Validates training parameters against supported keys.
@@ -57,7 +57,7 @@ def validate_training_params(
 
 
 # ToDo: implement post health checks for customizer are enabled
-async def _get_health(url: str) -> Tuple[bool, bool]: ...
+async def _get_health(url: str) -> tuple[bool, bool]: ...
 
 
 async def check_health(config: NvidiaPostTrainingConfig) -> None: ...
diff --git a/llama_stack/providers/remote/safety/bedrock/bedrock.py b/llama_stack/providers/remote/safety/bedrock/bedrock.py
index 2f960eead..c43b51073 100644
--- a/llama_stack/providers/remote/safety/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/safety/bedrock/bedrock.py
@@ -6,7 +6,7 @@
 
 import json
 import logging
-from typing import Any, Dict, List
+from typing import Any
 
 from llama_stack.apis.inference import Message
 from llama_stack.apis.safety import (
@@ -53,7 +53,7 @@ class BedrockSafetyAdapter(Safety, ShieldsProtocolPrivate):
             )
 
     async def run_shield(
-        self, shield_id: str, messages: List[Message], params: Dict[str, Any] = None
+        self, shield_id: str, messages: list[Message], params: dict[str, Any] | None = None
     ) -> RunShieldResponse:
         shield = await self.shield_store.get_shield(shield_id)
         if not shield:
diff --git a/llama_stack/providers/remote/safety/nvidia/config.py b/llama_stack/providers/remote/safety/nvidia/config.py
index 3df80ed4f..4ca703a4d 100644
--- a/llama_stack/providers/remote/safety/nvidia/config.py
+++ b/llama_stack/providers/remote/safety/nvidia/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import os
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -27,10 +27,10 @@ class NVIDIASafetyConfig(BaseModel):
         default_factory=lambda: os.getenv("GUARDRAILS_SERVICE_URL", "http://0.0.0.0:7331"),
         description="The url for accessing the guardrails service",
     )
-    config_id: Optional[str] = Field(default="self-check", description="Config ID to use from the config store")
+    config_id: str | None = Field(default="self-check", description="Config ID to use from the config store")
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "guardrails_service_url": "${env.GUARDRAILS_SERVICE_URL:http://localhost:7331}",
             "config_id": "self-check",
diff --git a/llama_stack/providers/remote/safety/nvidia/nvidia.py b/llama_stack/providers/remote/safety/nvidia/nvidia.py
index 1ff4a6ad9..411badb1c 100644
--- a/llama_stack/providers/remote/safety/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/safety/nvidia/nvidia.py
@@ -5,15 +5,15 @@
 # the root directory of this source tree.
 
 import logging
-from typing import Any, List, Optional
+from typing import Any
 
 import requests
 
 from llama_stack.apis.inference import Message
 from llama_stack.apis.safety import RunShieldResponse, Safety, SafetyViolation, ViolationLevel
 from llama_stack.apis.shields import Shield
-from llama_stack.distribution.library_client import convert_pydantic_to_json_value
 from llama_stack.providers.datatypes import ShieldsProtocolPrivate
+from llama_stack.providers.utils.inference.openai_compat import convert_message_to_openai_dict_new
 
 from .config import NVIDIASafetyConfig
 
@@ -28,7 +28,6 @@ class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate):
         Args:
             config (NVIDIASafetyConfig): The configuration containing the guardrails service URL and config ID.
         """
-        print(f"Initializing NVIDIASafetyAdapter({config.guardrails_service_url})...")
         self.config = config
 
     async def initialize(self) -> None:
@@ -42,7 +41,7 @@ class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate):
             raise ValueError("Shield model not provided.")
 
     async def run_shield(
-        self, shield_id: str, messages: List[Message], params: Optional[dict[str, Any]] = None
+        self, shield_id: str, messages: list[Message], params: dict[str, Any] | None = None
     ) -> RunShieldResponse:
         """
         Run a safety shield check against the provided messages.
@@ -113,7 +112,7 @@ class NeMoGuardrails:
         response.raise_for_status()
         return response.json()
 
-    async def run(self, messages: List[Message]) -> RunShieldResponse:
+    async def run(self, messages: list[Message]) -> RunShieldResponse:
         """
         Queries the /v1/guardrails/checks endpoint of the NeMo guardrails deployed API.
 
@@ -127,9 +126,10 @@ class NeMoGuardrails:
         Raises:
             requests.HTTPError: If the POST request fails.
         """
+        request_messages = [await convert_message_to_openai_dict_new(message) for message in messages]
         request_data = {
             "model": self.model,
-            "messages": convert_pydantic_to_json_value(messages),
+            "messages": request_messages,
             "temperature": self.temperature,
             "top_p": 1,
             "frequency_penalty": 0,
diff --git a/llama_stack/providers/remote/safety/sambanova/__init__.py b/llama_stack/providers/remote/safety/sambanova/__init__.py
new file mode 100644
index 000000000..bb9d15374
--- /dev/null
+++ b/llama_stack/providers/remote/safety/sambanova/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from typing import Any
+
+from .config import SambaNovaSafetyConfig
+
+
+async def get_adapter_impl(config: SambaNovaSafetyConfig, _deps) -> Any:  # _deps is accepted but not used here
+    from .sambanova import SambaNovaSafetyAdapter  # imported at call time rather than at package import
+
+    impl = SambaNovaSafetyAdapter(config)
+    await impl.initialize()  # the adapter is handed back already initialized
+    return impl
diff --git a/llama_stack/providers/remote/safety/sambanova/config.py b/llama_stack/providers/remote/safety/sambanova/config.py
new file mode 100644
index 000000000..383cea244
--- /dev/null
+++ b/llama_stack/providers/remote/safety/sambanova/config.py
@@ -0,0 +1,37 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from pydantic import BaseModel, Field, SecretStr
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class SambaNovaProviderDataValidator(BaseModel):  # presumably validates per-request provider data; TODO confirm
+    sambanova_api_key: str | None = Field(  # optional; None when not supplied
+        default=None,
+        description="Sambanova Cloud API key",
+    )
+
+
+@json_schema_type
+class SambaNovaSafetyConfig(BaseModel):  # settings for the SambaNova safety provider
+    url: str = Field(  # base URL of the SambaNova API
+        default="https://api.sambanova.ai/v1",
+        description="The URL for the SambaNova AI server",
+    )
+    api_key: SecretStr | None = Field(  # SecretStr; unset (None) by default
+        default=None,
+        description="The SambaNova cloud API Key",
+    )
+
+    @classmethod
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
+        return {  # sample settings; api_key defaults to an env-var placeholder string, **kwargs are ignored
+            "url": "https://api.sambanova.ai/v1",
+            "api_key": api_key,
+        }
diff --git a/llama_stack/providers/remote/safety/sambanova/sambanova.py b/llama_stack/providers/remote/safety/sambanova/sambanova.py
new file mode 100644
index 000000000..84c8267ae
--- /dev/null
+++ b/llama_stack/providers/remote/safety/sambanova/sambanova.py
@@ -0,0 +1,133 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+import json
+import logging
+from typing import Any
+
+import litellm
+import requests
+
+from llama_stack.apis.inference import Message
+from llama_stack.apis.safety import (
+    RunShieldResponse,
+    Safety,
+    SafetyViolation,
+    ViolationLevel,
+)
+from llama_stack.apis.shields import Shield
+from llama_stack.distribution.request_headers import NeedsRequestProviderData
+from llama_stack.providers.datatypes import ShieldsProtocolPrivate
+from llama_stack.providers.utils.inference.openai_compat import convert_message_to_openai_dict_new
+
+from .config import SambaNovaSafetyConfig
+
+logger = logging.getLogger(__name__)
+
+CANNED_RESPONSE_TEXT = "I can't answer that. Can I help with something else?"
+
+# Upper bound (seconds) on the model-listing request made while registering a shield.
+LIST_MODELS_TIMEOUT_SECONDS = 30
+
+
+class SambaNovaSafetyAdapter(Safety, ShieldsProtocolPrivate, NeedsRequestProviderData):
+    """Safety provider that runs shields by sending the conversation to a SambaNova guard model."""
+
+    def __init__(self, config: SambaNovaSafetyConfig) -> None:
+        self.config = config
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    def _get_api_key(self) -> str:
+        """Return the API key from the provider config, falling back to per-request provider data.
+
+        Raises:
+            ValueError: If neither the config nor the request provider data supplies a key.
+        """
+        if self.config.api_key:
+            return self.config.api_key.get_secret_value()
+
+        provider_data = self.get_request_provider_data()
+        if provider_data is None or not provider_data.sambanova_api_key:
+            raise ValueError(
+                "Pass Sambanova API Key in the header X-LlamaStack-Provider-Data as "
+                '{ "sambanova_api_key": <your api key> }'
+            )
+        return provider_data.sambanova_api_key
+
+    async def register_shield(self, shield: Shield) -> None:
+        """Check that `shield` names a guard model listed by the configured SambaNova endpoint.
+
+        Raises:
+            RuntimeError: If the model-listing request fails or times out.
+            ValueError: If no models are listed, the resource id does not contain "guard", or the
+                part of the resource id after "sambanova/" is not among the listed model ids.
+        """
+        list_models_url = self.config.url + "/models"
+        try:
+            # requests is synchronous: run it in a worker thread so the event loop is not blocked,
+            # and bound the wait so an unresponsive endpoint cannot hang registration forever.
+            response = await asyncio.to_thread(requests.get, list_models_url, timeout=LIST_MODELS_TIMEOUT_SECONDS)
+            response.raise_for_status()
+        except requests.exceptions.RequestException as e:
+            raise RuntimeError(f"Request to {list_models_url} failed") from e
+        available_models = [model.get("id") for model in response.json().get("data", [])]
+        if (
+            len(available_models) == 0
+            or "guard" not in shield.provider_resource_id.lower()
+            or shield.provider_resource_id.split("sambanova/")[-1] not in available_models
+        ):
+            raise ValueError(f"Shield {shield.provider_resource_id} not found in SambaNova")
+
+    async def run_shield(
+        self, shield_id: str, messages: list[Message], params: dict[str, Any] | None = None
+    ) -> RunShieldResponse:
+        """Ask the guard model behind `shield_id` to classify `messages`.
+
+        Returns a response carrying an ERROR-level violation when the model's reply contains
+        "unsafe" (case-insensitive); otherwise an empty response. `params` is currently unused.
+
+        Raises:
+            ValueError: If the shield is unknown or the guard model returns no text.
+        """
+        shield = await self.shield_store.get_shield(shield_id)
+        if not shield:
+            raise ValueError(f"Shield {shield_id} not found")
+
+        shield_params = shield.params
+        logger.debug(f"run_shield::{shield_params}::messages={messages}")
+        content_messages = [await convert_message_to_openai_dict_new(m) for m in messages]
+        logger.debug(f"run_shield::final:messages::{json.dumps(content_messages, indent=2)}:")
+
+        # Use the async client: the synchronous litellm.completion would block the event loop
+        # for the whole duration of the remote call.
+        response = await litellm.acompletion(
+            model=shield.provider_resource_id, messages=content_messages, api_key=self._get_api_key()
+        )
+        shield_message = response.choices[0].message.content
+        if shield_message is None:
+            # Fail closed: without a verdict we cannot claim the content is safe.
+            raise ValueError(f"Shield {shield_id} returned an empty response")
+
+        if "unsafe" in shield_message.lower():
+            user_message = CANNED_RESPONSE_TEXT
+            violation_type = shield_message.split("\n")[-1]
+            metadata = {"violation_type": violation_type}
+
+            return RunShieldResponse(
+                violation=SafetyViolation(
+                    user_message=user_message,
+                    violation_level=ViolationLevel.ERROR,
+                    metadata=metadata,
+                )
+            )
+
+        return RunShieldResponse()
diff --git a/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py b/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py
index b34c9fd9d..7e82cb6d4 100644
--- a/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py
+++ b/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py
@@ -5,26 +5,26 @@
 # the root directory of this source tree.
 
 import json
-from typing import Any, Dict, Optional
+from typing import Any
 
 import httpx
 
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.tools import (
     ListToolDefsResponse,
-    Tool,
     ToolDef,
+    ToolGroup,
     ToolInvocationResult,
     ToolParameter,
     ToolRuntime,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
-from llama_stack.providers.datatypes import ToolsProtocolPrivate
+from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
 
 from .config import BingSearchToolConfig
 
 
-class BingSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
+class BingSearchToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
     def __init__(self, config: BingSearchToolConfig):
         self.config = config
         self.url = "https://api.bing.microsoft.com/v7.0/search"
@@ -32,10 +32,10 @@ class BingSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestP
     async def initialize(self):
         pass
 
-    async def register_tool(self, tool: Tool) -> None:
+    async def register_toolgroup(self, toolgroup: ToolGroup) -> None:
         pass
 
-    async def unregister_tool(self, tool_id: str) -> None:
+    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
         return
 
     def _get_api_key(self) -> str:
@@ -50,7 +50,7 @@ class BingSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestP
         return provider_data.bing_search_api_key
 
     async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
     ) -> ListToolDefsResponse:
         return ListToolDefsResponse(
             data=[
@@ -68,7 +68,7 @@ class BingSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestP
             ]
         )
 
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
         api_key = self._get_api_key()
         headers = {
             "Ocp-Apim-Subscription-Key": api_key,
diff --git a/llama_stack/providers/remote/tool_runtime/bing_search/config.py b/llama_stack/providers/remote/tool_runtime/bing_search/config.py
index 4f089439f..30269dbc1 100644
--- a/llama_stack/providers/remote/tool_runtime/bing_search/config.py
+++ b/llama_stack/providers/remote/tool_runtime/bing_search/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -12,11 +12,11 @@ from pydantic import BaseModel
 class BingSearchToolConfig(BaseModel):
     """Configuration for Bing Search Tool Runtime"""
 
-    api_key: Optional[str] = None
+    api_key: str | None = None
     top_k: int = 3
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "api_key": "${env.BING_API_KEY:}",
         }
diff --git a/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py b/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
index 41f3ce823..b96b9e59c 100644
--- a/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
+++ b/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
@@ -4,37 +4,37 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 import httpx
 
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.tools import (
     ListToolDefsResponse,
-    Tool,
     ToolDef,
+    ToolGroup,
     ToolInvocationResult,
     ToolParameter,
     ToolRuntime,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.models.llama.datatypes import BuiltinTool
-from llama_stack.providers.datatypes import ToolsProtocolPrivate
+from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
 
 from .config import BraveSearchToolConfig
 
 
-class BraveSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
+class BraveSearchToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
     def __init__(self, config: BraveSearchToolConfig):
         self.config = config
 
     async def initialize(self):
         pass
 
-    async def register_tool(self, tool: Tool) -> None:
+    async def register_toolgroup(self, toolgroup: ToolGroup) -> None:
         pass
 
-    async def unregister_tool(self, tool_id: str) -> None:
+    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
         return
 
     def _get_api_key(self) -> str:
@@ -49,7 +49,7 @@ class BraveSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequest
         return provider_data.brave_search_api_key
 
     async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
     ) -> ListToolDefsResponse:
         return ListToolDefsResponse(
             data=[
@@ -68,7 +68,7 @@ class BraveSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequest
             ]
         )
 
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
         api_key = self._get_api_key()
         url = "https://api.search.brave.com/res/v1/web/search"
         headers = {
diff --git a/llama_stack/providers/remote/tool_runtime/brave_search/config.py b/llama_stack/providers/remote/tool_runtime/brave_search/config.py
index ab6053609..37ba21304 100644
--- a/llama_stack/providers/remote/tool_runtime/brave_search/config.py
+++ b/llama_stack/providers/remote/tool_runtime/brave_search/config.py
@@ -4,13 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
 
 class BraveSearchToolConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="The Brave Search API Key",
     )
@@ -20,7 +20,7 @@ class BraveSearchToolConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
         return {
             "api_key": "${env.BRAVE_SEARCH_API_KEY:}",
             "max_results": 3,
diff --git a/llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py b/llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py
index fb1f558e5..051a880a7 100644
--- a/llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py
+++ b/llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py
@@ -4,18 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from pydantic import BaseModel
-
-from .config import ModelContextProtocolConfig
+from .config import MCPProviderConfig
 
 
-class ModelContextProtocolToolProviderDataValidator(BaseModel):
-    api_key: str
-
-
-async def get_adapter_impl(config: ModelContextProtocolConfig, _deps):
+async def get_adapter_impl(config: MCPProviderConfig, _deps):
     from .model_context_protocol import ModelContextProtocolToolRuntimeImpl
 
-    impl = ModelContextProtocolToolRuntimeImpl(config)
+    impl = ModelContextProtocolToolRuntimeImpl(config, _deps)
     await impl.initialize()
     return impl
diff --git a/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py b/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py
index 30ac407bc..b8c5e77fd 100644
--- a/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py
+++ b/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py
@@ -4,12 +4,17 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
 
-class ModelContextProtocolConfig(BaseModel):
+class MCPProviderDataValidator(BaseModel):
+    # mcp_endpoint => dict of headers to send
+    mcp_headers: dict[str, dict[str, str]] | None = None
+
+
+class MCPProviderConfig(BaseModel):
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {}
diff --git a/llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py b/llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py
index 676917225..a9b252dfe 100644
--- a/llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py
+++ b/llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py
@@ -4,66 +4,50 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 from urllib.parse import urlparse
 
-from mcp import ClientSession
-from mcp.client.sse import sse_client
-
 from llama_stack.apis.common.content_types import URL
+from llama_stack.apis.datatypes import Api
 from llama_stack.apis.tools import (
     ListToolDefsResponse,
-    ToolDef,
+    ToolGroup,
     ToolInvocationResult,
-    ToolParameter,
     ToolRuntime,
 )
-from llama_stack.providers.datatypes import ToolsProtocolPrivate
+from llama_stack.distribution.request_headers import NeedsRequestProviderData
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
+from llama_stack.providers.utils.tools.mcp import invoke_mcp_tool, list_mcp_tools
 
-from .config import ModelContextProtocolConfig
+from .config import MCPProviderConfig
+
+logger = get_logger(__name__, category="tools")
 
 
-class ModelContextProtocolToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):
-    def __init__(self, config: ModelContextProtocolConfig):
+class ModelContextProtocolToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
+    def __init__(self, config: MCPProviderConfig, _deps: dict[Api, Any]):
         self.config = config
 
     async def initialize(self):
         pass
 
+    async def register_toolgroup(self, toolgroup: ToolGroup) -> None:
+        pass
+
+    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
+        return
+
     async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
     ) -> ListToolDefsResponse:
+        # this endpoint should be retrieved by getting the tool group right?
         if mcp_endpoint is None:
             raise ValueError("mcp_endpoint is required")
+        headers = await self.get_headers_from_request(mcp_endpoint.uri)
+        return await list_mcp_tools(mcp_endpoint.uri, headers)
 
-        tools = []
-        async with sse_client(mcp_endpoint.uri) as streams:
-            async with ClientSession(*streams) as session:
-                await session.initialize()
-                tools_result = await session.list_tools()
-                for tool in tools_result.tools:
-                    parameters = []
-                    for param_name, param_schema in tool.inputSchema.get("properties", {}).items():
-                        parameters.append(
-                            ToolParameter(
-                                name=param_name,
-                                parameter_type=param_schema.get("type", "string"),
-                                description=param_schema.get("description", ""),
-                            )
-                        )
-                    tools.append(
-                        ToolDef(
-                            name=tool.name,
-                            description=tool.description,
-                            parameters=parameters,
-                            metadata={
-                                "endpoint": mcp_endpoint.uri,
-                            },
-                        )
-                    )
-        return ListToolDefsResponse(data=tools)
-
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
         tool = await self.tool_store.get_tool(tool_name)
         if tool.metadata is None or tool.metadata.get("endpoint") is None:
             raise ValueError(f"Tool {tool_name} does not have metadata")
@@ -71,12 +55,19 @@ class ModelContextProtocolToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):
         if urlparse(endpoint).scheme not in ("http", "https"):
             raise ValueError(f"Endpoint {endpoint} is not a valid HTTP(S) URL")
 
-        async with sse_client(endpoint) as streams:
-            async with ClientSession(*streams) as session:
-                await session.initialize()
-                result = await session.call_tool(tool.identifier, kwargs)
+        headers = await self.get_headers_from_request(endpoint)
+        return await invoke_mcp_tool(endpoint, headers, tool_name, kwargs)
 
-        return ToolInvocationResult(
-            content="\n".join([result.model_dump_json() for result in result.content]),
-            error_code=1 if result.isError else 0,
-        )
+    async def get_headers_from_request(self, mcp_endpoint_uri: str) -> dict[str, str]:
+        def canonicalize_uri(uri: str) -> str:
+            parsed = urlparse(uri)  # parse once; only netloc and path (not the scheme) are compared
+            return f"{parsed.netloc or ''}/{parsed.path or ''}"
+
+        headers = {}  # merged headers from every mcp_headers entry whose URI matches the endpoint
+        provider_data = self.get_request_provider_data()
+        if provider_data and provider_data.mcp_headers:
+            target = canonicalize_uri(mcp_endpoint_uri)  # loop-invariant: computed once, not per entry
+            for uri, values in provider_data.mcp_headers.items():
+                if canonicalize_uri(uri) == target:
+                    headers.update(values)
+        return headers
diff --git a/llama_stack/providers/remote/tool_runtime/tavily_search/config.py b/llama_stack/providers/remote/tool_runtime/tavily_search/config.py
index 945430bb1..c9b18d30d 100644
--- a/llama_stack/providers/remote/tool_runtime/tavily_search/config.py
+++ b/llama_stack/providers/remote/tool_runtime/tavily_search/config.py
@@ -4,13 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
 
 class TavilySearchToolConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
         default=None,
         description="The Tavily Search API Key",
     )
@@ -20,7 +20,7 @@ class TavilySearchToolConfig(BaseModel):
     )
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
         return {
             "api_key": "${env.TAVILY_SEARCH_API_KEY:}",
             "max_results": 3,
diff --git a/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py b/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py
index 719d6be14..1fe91fd7f 100644
--- a/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py
+++ b/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py
@@ -5,36 +5,36 @@
 # the root directory of this source tree.
 
 import json
-from typing import Any, Dict, Optional
+from typing import Any
 
 import httpx
 
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.tools import (
     ListToolDefsResponse,
-    Tool,
     ToolDef,
+    ToolGroup,
     ToolInvocationResult,
     ToolParameter,
     ToolRuntime,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
-from llama_stack.providers.datatypes import ToolsProtocolPrivate
+from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
 
 from .config import TavilySearchToolConfig
 
 
-class TavilySearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
+class TavilySearchToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
     def __init__(self, config: TavilySearchToolConfig):
         self.config = config
 
     async def initialize(self):
         pass
 
-    async def register_tool(self, tool: Tool) -> None:
+    async def register_toolgroup(self, toolgroup: ToolGroup) -> None:
         pass
 
-    async def unregister_tool(self, tool_id: str) -> None:
+    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
         return
 
     def _get_api_key(self) -> str:
@@ -49,7 +49,7 @@ class TavilySearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
         return provider_data.tavily_search_api_key
 
     async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
     ) -> ListToolDefsResponse:
         return ListToolDefsResponse(
             data=[
@@ -67,7 +67,7 @@ class TavilySearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
             ]
         )
 
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
         api_key = self._get_api_key()
         async with httpx.AsyncClient() as client:
             response = await client.post(
diff --git a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py
index 8ea49c7b5..aefc86bd6 100644
--- a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py
+++ b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -12,10 +12,10 @@ from pydantic import BaseModel
 class WolframAlphaToolConfig(BaseModel):
     """Configuration for WolframAlpha Tool Runtime"""
 
-    api_key: Optional[str] = None
+    api_key: str | None = None
 
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {
             "api_key": "${env.WOLFRAM_ALPHA_API_KEY:}",
         }
diff --git a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
index b3e0e120c..6e1d0f61d 100644
--- a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
+++ b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
@@ -5,26 +5,26 @@
 # the root directory of this source tree.
 
 import json
-from typing import Any, Dict, Optional
+from typing import Any
 
 import httpx
 
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.tools import (
     ListToolDefsResponse,
-    Tool,
     ToolDef,
+    ToolGroup,
     ToolInvocationResult,
     ToolParameter,
     ToolRuntime,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
-from llama_stack.providers.datatypes import ToolsProtocolPrivate
+from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
 
 from .config import WolframAlphaToolConfig
 
 
-class WolframAlphaToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
+class WolframAlphaToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, NeedsRequestProviderData):
     def __init__(self, config: WolframAlphaToolConfig):
         self.config = config
         self.url = "https://api.wolframalpha.com/v2/query"
@@ -32,10 +32,10 @@ class WolframAlphaToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
     async def initialize(self):
         pass
 
-    async def register_tool(self, tool: Tool) -> None:
+    async def register_toolgroup(self, toolgroup: ToolGroup) -> None:
         pass
 
-    async def unregister_tool(self, tool_id: str) -> None:
+    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
         return
 
     def _get_api_key(self) -> str:
@@ -50,7 +50,7 @@ class WolframAlphaToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
         return provider_data.wolfram_alpha_api_key
 
     async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
     ) -> ListToolDefsResponse:
         return ListToolDefsResponse(
             data=[
@@ -68,7 +68,7 @@ class WolframAlphaToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
             ]
         )
 
-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
         api_key = self._get_api_key()
         params = {
             "input": kwargs["query"],
diff --git a/llama_stack/providers/remote/vector_io/chroma/__init__.py b/llama_stack/providers/remote/vector_io/chroma/__init__.py
index 8646b04d6..ebbc62b1c 100644
--- a/llama_stack/providers/remote/vector_io/chroma/__init__.py
+++ b/llama_stack/providers/remote/vector_io/chroma/__init__.py
@@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec
 
 from .config import ChromaVectorIOConfig
 
 
-async def get_adapter_impl(config: ChromaVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: ChromaVectorIOConfig, deps: dict[Api, ProviderSpec]):
     from .chroma import ChromaVectorIOAdapter
 
     impl = ChromaVectorIOAdapter(config, deps[Api.inference])
diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py
index 3bf3a7740..a59a38573 100644
--- a/llama_stack/providers/remote/vector_io/chroma/chroma.py
+++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py
@@ -6,7 +6,7 @@
 import asyncio
 import json
 import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 from urllib.parse import urlparse
 
 import chromadb
@@ -26,8 +26,7 @@ from .config import ChromaVectorIOConfig as RemoteChromaVectorIOConfig
 
 log = logging.getLogger(__name__)
 
-
-ChromaClientType = Union[chromadb.AsyncHttpClient, chromadb.PersistentClient]
+ChromaClientType = chromadb.api.AsyncClientAPI | chromadb.api.ClientAPI
 
 
 # this is a helper to allow us to use async and non-async chroma clients interchangeably
@@ -42,7 +41,7 @@ class ChromaIndex(EmbeddingIndex):
         self.client = client
         self.collection = collection
 
-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
         assert len(chunks) == len(embeddings), (
             f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
         )
@@ -85,11 +84,19 @@ class ChromaIndex(EmbeddingIndex):
     async def delete(self):
         await maybe_await(self.client.delete_collection(self.collection.name))
 
+    async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
+        """Keyword-search hook for the Chroma index.
+
+        Not implemented for this backend: the arguments are ignored and
+        NotImplementedError is raised unconditionally.
+        """
+        raise NotImplementedError("Keyword search is not supported in Chroma")
+
 
 class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     def __init__(
         self,
-        config: Union[RemoteChromaVectorIOConfig, InlineChromaVectorIOConfig],
+        config: RemoteChromaVectorIOConfig | InlineChromaVectorIOConfig,
         inference_api: Api.inference,
     ) -> None:
         log.info(f"Initializing ChromaVectorIOAdapter with url: {config}")
@@ -137,8 +144,8 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     async def insert_chunks(
         self,
         vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
     ) -> None:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
 
@@ -148,7 +155,7 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         self,
         vector_db_id: str,
         query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
 
diff --git a/llama_stack/providers/remote/vector_io/chroma/config.py b/llama_stack/providers/remote/vector_io/chroma/config.py
index 3e2463252..4e893fab4 100644
--- a/llama_stack/providers/remote/vector_io/chroma/config.py
+++ b/llama_stack/providers/remote/vector_io/chroma/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -13,5 +13,5 @@ class ChromaVectorIOConfig(BaseModel):
     url: str
 
     @classmethod
-    def sample_run_config(cls, url: str = "${env.CHROMADB_URL}", **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, url: str = "${env.CHROMADB_URL}", **kwargs: Any) -> dict[str, Any]:
         return {"url": url}
diff --git a/llama_stack/providers/remote/vector_io/milvus/__init__.py b/llama_stack/providers/remote/vector_io/milvus/__init__.py
index 84cb1d748..92dbfda2e 100644
--- a/llama_stack/providers/remote/vector_io/milvus/__init__.py
+++ b/llama_stack/providers/remote/vector_io/milvus/__init__.py
@@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec
 
 from .config import MilvusVectorIOConfig
 
 
-async def get_adapter_impl(config: MilvusVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: MilvusVectorIOConfig, deps: dict[Api, ProviderSpec]):
     from .milvus import MilvusVectorIOAdapter
 
     assert isinstance(config, MilvusVectorIOConfig), f"Unexpected config type: {type(config)}"
diff --git a/llama_stack/providers/remote/vector_io/milvus/config.py b/llama_stack/providers/remote/vector_io/milvus/config.py
index 17da6b23d..9bdc7ed5c 100644
--- a/llama_stack/providers/remote/vector_io/milvus/config.py
+++ b/llama_stack/providers/remote/vector_io/milvus/config.py
@@ -4,9 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
 
 from llama_stack.schema_utils import json_schema_type
 
@@ -14,9 +14,11 @@ from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class MilvusVectorIOConfig(BaseModel):
     uri: str
-    token: Optional[str] = None
+    token: str | None = None
     consistency_level: str = "Strong"
 
+    model_config = ConfigDict(extra="allow")
+
     @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
         return {"uri": "${env.MILVUS_ENDPOINT}", "token": "${env.MILVUS_TOKEN}"}
diff --git a/llama_stack/providers/remote/vector_io/milvus/milvus.py b/llama_stack/providers/remote/vector_io/milvus/milvus.py
index 1949d293d..6628292db 100644
--- a/llama_stack/providers/remote/vector_io/milvus/milvus.py
+++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py
@@ -9,7 +9,7 @@ import hashlib
 import logging
 import os
 import uuid
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 from numpy.typing import NDArray
 from pymilvus import MilvusClient
@@ -39,7 +39,7 @@ class MilvusIndex(EmbeddingIndex):
         if await asyncio.to_thread(self.client.has_collection, self.collection_name):
             await asyncio.to_thread(self.client.drop_collection, collection_name=self.collection_name)
 
-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
         assert len(chunks) == len(embeddings), (
             f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
         )
@@ -73,7 +73,7 @@ class MilvusIndex(EmbeddingIndex):
             logger.error(f"Error inserting chunks into Milvus collection {self.collection_name}: {e}")
             raise e
 
-    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
         search_res = await asyncio.to_thread(
             self.client.search,
             collection_name=self.collection_name,
@@ -86,10 +86,18 @@ class MilvusIndex(EmbeddingIndex):
         scores = [res["distance"] for res in search_res[0]]
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
+    async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
+        """Keyword-search hook for the Milvus index.
+
+        Not implemented for this backend: the arguments are ignored and
+        NotImplementedError is raised unconditionally.
+        """
+        raise NotImplementedError("Keyword search is not supported in Milvus")
+
 
 class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     def __init__(
-        self, config: Union[RemoteMilvusVectorIOConfig, InlineMilvusVectorIOConfig], inference_api: Api.inference
+        self, config: RemoteMilvusVectorIOConfig | InlineMilvusVectorIOConfig, inference_api: Api.inference
     ) -> None:
         self.config = config
         self.cache = {}
@@ -124,7 +132,7 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
 
         self.cache[vector_db.identifier] = index
 
-    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> Optional[VectorDBWithIndex]:
+    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex | None:
         if vector_db_id in self.cache:
             return self.cache[vector_db_id]
 
@@ -148,8 +156,8 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     async def insert_chunks(
         self,
         vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
     ) -> None:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
         if not index:
@@ -161,7 +169,7 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         self,
         vector_db_id: str,
         query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
         if not index:
@@ -172,7 +180,7 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
 
 def generate_chunk_id(document_id: str, chunk_text: str) -> str:
     """Generate a unique chunk ID using a hash of document ID and chunk text."""
-    hash_input = f"{document_id}:{chunk_text}".encode("utf-8")
+    hash_input = f"{document_id}:{chunk_text}".encode()
     return str(uuid.UUID(hashlib.md5(hash_input).hexdigest()))
 
 
diff --git a/llama_stack/providers/remote/vector_io/pgvector/__init__.py b/llama_stack/providers/remote/vector_io/pgvector/__init__.py
index 089d890b7..9f528db74 100644
--- a/llama_stack/providers/remote/vector_io/pgvector/__init__.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/__init__.py
@@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec
 
 from .config import PGVectorVectorIOConfig
 
 
-async def get_adapter_impl(config: PGVectorVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: PGVectorVectorIOConfig, deps: dict[Api, ProviderSpec]):
     from .pgvector import PGVectorVectorIOAdapter
 
     impl = PGVectorVectorIOAdapter(config, deps[Api.inference])
diff --git a/llama_stack/providers/remote/vector_io/pgvector/config.py b/llama_stack/providers/remote/vector_io/pgvector/config.py
index e9eb0f12d..04b92a2e4 100644
--- a/llama_stack/providers/remote/vector_io/pgvector/config.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -28,5 +28,5 @@ class PGVectorVectorIOConfig(BaseModel):
         user: str = "${env.PGVECTOR_USER}",
         password: str = "${env.PGVECTOR_PASSWORD}",
         **kwargs: Any,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         return {"host": host, "port": port, "db": db, "user": user, "password": password}
diff --git a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
index 7c683e126..ea918c552 100644
--- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import logging
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import psycopg2
 from numpy.typing import NDArray
@@ -33,7 +33,7 @@ def check_extension_version(cur):
     return result[0] if result else None
 
 
-def upsert_models(conn, keys_models: List[Tuple[str, BaseModel]]):
+def upsert_models(conn, keys_models: list[tuple[str, BaseModel]]):
     with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
         query = sql.SQL(
             """
@@ -74,7 +74,7 @@ class PGVectorIndex(EmbeddingIndex):
             """
             )
 
-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
         assert len(chunks) == len(embeddings), (
             f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
         )
@@ -99,7 +99,7 @@ class PGVectorIndex(EmbeddingIndex):
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
             execute_values(cur, query, values, template="(%s, %s, %s::vector)")
 
-    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
             cur.execute(
                 f"""
@@ -120,6 +120,14 @@ class PGVectorIndex(EmbeddingIndex):
 
             return QueryChunksResponse(chunks=chunks, scores=scores)
 
+    async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
+        """Keyword-search hook for the PGVector index.
+
+        Not implemented for this backend: the arguments are ignored and
+        NotImplementedError is raised unconditionally.
+        """
+        raise NotImplementedError("Keyword search is not supported in PGVector")
+
     async def delete(self):
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
             cur.execute(f"DROP TABLE IF EXISTS {self.table_name}")
@@ -180,8 +188,8 @@ class PGVectorVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     async def insert_chunks(
         self,
         vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
     ) -> None:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
         await index.insert_chunks(chunks)
@@ -190,7 +198,7 @@ class PGVectorVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         self,
         vector_db_id: str,
         query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
         return await index.query_chunks(query, params)
diff --git a/llama_stack/providers/remote/vector_io/qdrant/__init__.py b/llama_stack/providers/remote/vector_io/qdrant/__init__.py
index f5bb7f84c..029de285f 100644
--- a/llama_stack/providers/remote/vector_io/qdrant/__init__.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/__init__.py
@@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec
 
 from .config import QdrantVectorIOConfig
 
 
-async def get_adapter_impl(config: QdrantVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: QdrantVectorIOConfig, deps: dict[Api, ProviderSpec]):
     from .qdrant import QdrantVectorIOAdapter
 
     impl = QdrantVectorIOAdapter(config, deps[Api.inference])
diff --git a/llama_stack/providers/remote/vector_io/qdrant/config.py b/llama_stack/providers/remote/vector_io/qdrant/config.py
index 6d7eebe23..314d3f5f1 100644
--- a/llama_stack/providers/remote/vector_io/qdrant/config.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, Optional
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -13,19 +13,19 @@ from llama_stack.schema_utils import json_schema_type
 
 @json_schema_type
 class QdrantVectorIOConfig(BaseModel):
-    location: Optional[str] = None
-    url: Optional[str] = None
-    port: Optional[int] = 6333
+    location: str | None = None
+    url: str | None = None
+    port: int | None = 6333
     grpc_port: int = 6334
     prefer_grpc: bool = False
-    https: Optional[bool] = None
-    api_key: Optional[str] = None
-    prefix: Optional[str] = None
-    timeout: Optional[int] = None
-    host: Optional[str] = None
+    https: bool | None = None
+    api_key: str | None = None
+    prefix: str | None = None
+    timeout: int | None = None
+    host: str | None = None
 
     @classmethod
-    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
         return {
             "api_key": "${env.QDRANT_API_KEY}",
         }
diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
index 9e7788dc0..ff0690083 100644
--- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
@@ -6,7 +6,7 @@
 
 import logging
 import uuid
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 from numpy.typing import NDArray
 from qdrant_client import AsyncQdrantClient, models
@@ -44,7 +44,7 @@ class QdrantIndex(EmbeddingIndex):
         self.client = client
         self.collection_name = collection_name
 
-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
         assert len(chunks) == len(embeddings), (
             f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
         )
@@ -68,7 +68,7 @@ class QdrantIndex(EmbeddingIndex):
 
         await self.client.upsert(collection_name=self.collection_name, points=points)
 
-    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
         results = (
             await self.client.query_points(
                 collection_name=self.collection_name,
@@ -95,13 +95,21 @@ class QdrantIndex(EmbeddingIndex):
 
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
+    async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
+        """Keyword-search hook for the Qdrant index.
+
+        Not implemented for this backend: the arguments are ignored and
+        NotImplementedError is raised unconditionally.
+        """
+        raise NotImplementedError("Keyword search is not supported in Qdrant")
+
     async def delete(self):
         await self.client.delete_collection(collection_name=self.collection_name)
 
 
 class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     def __init__(
-        self, config: Union[RemoteQdrantVectorIOConfig, InlineQdrantVectorIOConfig], inference_api: Api.inference
+        self, config: RemoteQdrantVectorIOConfig | InlineQdrantVectorIOConfig, inference_api: Api.inference
     ) -> None:
         self.config = config
         self.client: AsyncQdrantClient = None
@@ -131,7 +139,7 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
             await self.cache[vector_db_id].index.delete()
             del self.cache[vector_db_id]
 
-    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> Optional[VectorDBWithIndex]:
+    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex | None:
         if vector_db_id in self.cache:
             return self.cache[vector_db_id]
 
@@ -150,8 +158,8 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     async def insert_chunks(
         self,
         vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
     ) -> None:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
         if not index:
@@ -163,7 +171,7 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         self,
         vector_db_id: str,
         query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
         if not index:
diff --git a/llama_stack/providers/remote/vector_io/weaviate/__init__.py b/llama_stack/providers/remote/vector_io/weaviate/__init__.py
index c93c628d8..22e116c22 100644
--- a/llama_stack/providers/remote/vector_io/weaviate/__init__.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/__init__.py
@@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec
 
-from .config import WeaviateRequestProviderData, WeaviateVectorIOConfig  # noqa: F401
+from .config import WeaviateVectorIOConfig
 
 
-async def get_adapter_impl(config: WeaviateVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: WeaviateVectorIOConfig, deps: dict[Api, ProviderSpec]):
     from .weaviate import WeaviateVectorIOAdapter
 
     impl = WeaviateVectorIOAdapter(config, deps[Api.inference])
diff --git a/llama_stack/providers/remote/vector_io/weaviate/config.py b/llama_stack/providers/remote/vector_io/weaviate/config.py
index cc587f252..a8c6e3e2c 100644
--- a/llama_stack/providers/remote/vector_io/weaviate/config.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict
+from typing import Any
 
 from pydantic import BaseModel
 
@@ -16,5 +16,5 @@ class WeaviateRequestProviderData(BaseModel):
 
 class WeaviateVectorIOConfig(BaseModel):
     @classmethod
-    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
         return {}
diff --git a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
index 52aa2f3a3..e6fe8ccd3 100644
--- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import json
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import weaviate
 import weaviate.classes as wvc
@@ -33,7 +33,7 @@ class WeaviateIndex(EmbeddingIndex):
         self.client = client
         self.collection_name = collection_name
 
-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
         assert len(chunks) == len(embeddings), (
             f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
         )
@@ -55,7 +55,7 @@ class WeaviateIndex(EmbeddingIndex):
         # TODO: make this async friendly
         collection.data.insert_many(data_objects)
 
-    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
         collection = self.client.collections.get(self.collection_name)
 
         results = collection.query.near_vector(
@@ -80,10 +80,18 @@ class WeaviateIndex(EmbeddingIndex):
 
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
-    async def delete(self, chunk_ids: List[str]) -> None:
+    async def delete(self, chunk_ids: list[str]) -> None:
         collection = self.client.collections.get(self.collection_name)
         collection.data.delete_many(where=Filter.by_property("id").contains_any(chunk_ids))
 
+    async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
+        """Keyword-search hook for the Weaviate index.
+
+        Not implemented for this backend: the arguments are ignored and
+        NotImplementedError is raised unconditionally.
+        """
+        raise NotImplementedError("Keyword search is not supported in Weaviate")
+
 
 class WeaviateVectorIOAdapter(
     VectorIO,
@@ -144,7 +152,7 @@ class WeaviateVectorIOAdapter(
             self.inference_api,
         )
 
-    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> Optional[VectorDBWithIndex]:
+    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex | None:
         if vector_db_id in self.cache:
             return self.cache[vector_db_id]
 
@@ -167,8 +175,8 @@ class WeaviateVectorIOAdapter(
     async def insert_chunks(
         self,
         vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
     ) -> None:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
         if not index:
@@ -180,7 +188,7 @@ class WeaviateVectorIOAdapter(
         self,
         vector_db_id: str,
         query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
         index = await self._get_and_cache_vector_db_index(vector_db_id)
         if not index:
diff --git a/llama_stack/providers/tests/ci_test_config.yaml b/llama_stack/providers/tests/ci_test_config.yaml
deleted file mode 100644
index 3edcd38bf..000000000
--- a/llama_stack/providers/tests/ci_test_config.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
-inference:
-  tests:
-  - inference/test_vision_inference.py::test_vision_chat_completion_streaming
-  - inference/test_vision_inference.py::test_vision_chat_completion_non_streaming
-  - inference/test_text_inference.py::test_structured_output
-  - inference/test_text_inference.py::test_chat_completion_streaming
-  - inference/test_text_inference.py::test_chat_completion_non_streaming
-  - inference/test_text_inference.py::test_chat_completion_with_tool_calling
-  - inference/test_text_inference.py::test_chat_completion_with_tool_calling_streaming
-
-  scenarios:
-  - provider_fixtures:
-      inference: ollama
-  - fixture_combo_id: fireworks
-  - provider_fixtures:
-      inference: together
-    # - inference: tgi
-    # - inference: vllm_remote
-
-  inference_models:
-  - meta-llama/Llama-3.1-8B-Instruct
-  - meta-llama/Llama-3.2-11B-Vision-Instruct
-
-
-agents:
-  tests:
-   - agents/test_agents.py::test_agent_turns_with_safety
-   - agents/test_agents.py::test_rag_agent
-
-  scenarios:
-  - fixture_combo_id: ollama
-  - fixture_combo_id: together
-  - fixture_combo_id: fireworks
-
-  inference_models:
-  - meta-llama/Llama-3.2-1B-Instruct
-
-  safety_shield: meta-llama/Llama-Guard-3-1B
-
-
-memory:
-  tests:
-   - memory/test_memory.py::test_query_documents
-
-  scenarios:
-  - fixture_combo_id: ollama
-  - provider_fixtures:
-      inference: sentence_transformers
-      memory: faiss
-  - fixture_combo_id: chroma
-
-  inference_models:
-  - meta-llama/Llama-3.2-1B-Instruct
-
-  embedding_model: all-MiniLM-L6-v2
diff --git a/llama_stack/providers/tests/conftest.py b/llama_stack/providers/tests/conftest.py
deleted file mode 100644
index d3e715b7e..000000000
--- a/llama_stack/providers/tests/conftest.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import pytest
-import yaml
-from dotenv import load_dotenv
-from pydantic import BaseModel, Field
-from termcolor import colored
-
-from llama_stack.distribution.datatypes import Provider
-from llama_stack.providers.datatypes import RemoteProviderConfig
-
-from .env import get_env_or_fail
-from .report import Report
-
-
-class ProviderFixture(BaseModel):
-    providers: List[Provider]
-    provider_data: Optional[Dict[str, Any]] = None
-
-
-class TestScenario(BaseModel):
-    # provider fixtures can be either a mark or a dictionary of api -> providers
-    provider_fixtures: Dict[str, str] = Field(default_factory=dict)
-    fixture_combo_id: Optional[str] = None
-
-
-class APITestConfig(BaseModel):
-    scenarios: List[TestScenario] = Field(default_factory=list)
-    inference_models: List[str] = Field(default_factory=list)
-
-    # test name format should be ::
-    tests: List[str] = Field(default_factory=list)
-
-
-class MemoryApiTestConfig(APITestConfig):
-    embedding_model: Optional[str] = Field(default_factory=None)
-
-
-class AgentsApiTestConfig(APITestConfig):
-    safety_shield: Optional[str] = Field(default_factory=None)
-
-
-class TestConfig(BaseModel):
-    inference: Optional[APITestConfig] = None
-    agents: Optional[AgentsApiTestConfig] = None
-    memory: Optional[MemoryApiTestConfig] = None
-
-
-def get_test_config_from_config_file(metafunc_config):
-    config_file = metafunc_config.getoption("--config")
-    if config_file is None:
-        return None
-
-    config_file_path = Path(__file__).parent / config_file
-    if not config_file_path.exists():
-        raise ValueError(
-            f"Test config {config_file} was specified but not found. Please make sure it exists in the llama_stack/providers/tests directory."
-        )
-    with open(config_file_path, "r") as config_file:
-        config = yaml.safe_load(config_file)
-        return TestConfig(**config)
-
-
-def get_test_config_for_api(metafunc_config, api):
-    test_config = get_test_config_from_config_file(metafunc_config)
-    if test_config is None:
-        return None
-    return getattr(test_config, api)
-
-
-def get_provider_fixture_overrides_from_test_config(metafunc_config, api, default_provider_fixture_combinations):
-    api_config = get_test_config_for_api(metafunc_config, api)
-    if api_config is None:
-        return None
-
-    fixture_combo_ids = set()
-    custom_provider_fixture_combos = []
-    for scenario in api_config.scenarios:
-        if scenario.fixture_combo_id:
-            fixture_combo_ids.add(scenario.fixture_combo_id)
-        else:
-            custom_provider_fixture_combos.append(
-                pytest.param(
-                    scenario.provider_fixtures,
-                    id=scenario.provider_fixtures.get("inference") or "",
-                )
-            )
-
-    if len(fixture_combo_ids) > 0:
-        for default_fixture in default_provider_fixture_combinations:
-            if default_fixture.id in fixture_combo_ids:
-                custom_provider_fixture_combos.append(default_fixture)
-    return custom_provider_fixture_combos
-
-
-def remote_stack_fixture() -> ProviderFixture:
-    if url := os.getenv("REMOTE_STACK_URL", None):
-        config = RemoteProviderConfig.from_url(url)
-    else:
-        config = RemoteProviderConfig(
-            host=get_env_or_fail("REMOTE_STACK_HOST"),
-            port=int(get_env_or_fail("REMOTE_STACK_PORT")),
-        )
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="test::remote",
-                provider_type="test::remote",
-                config=config.model_dump(),
-            )
-        ],
-    )
-
-
-def pytest_configure(config):
-    config.option.tbstyle = "short"
-    config.option.disable_warnings = True
-
-    """Load environment variables at start of test run"""
-    # Load from .env file if it exists
-    env_file = Path(__file__).parent / ".env"
-    if env_file.exists():
-        load_dotenv(env_file)
-
-    # Load any environment variables passed via --env
-    env_vars = config.getoption("--env") or []
-    for env_var in env_vars:
-        key, value = env_var.split("=", 1)
-        os.environ[key] = value
-
-    if config.getoption("--output") is not None:
-        config.pluginmanager.register(Report(config.getoption("--output")))
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--providers",
-        default="",
-        help=(
-            "Provider configuration in format: api1=provider1,api2=provider2. "
-            "Example: --providers inference=ollama,safety=meta-reference"
-        ),
-    )
-    parser.addoption(
-        "--config",
-        action="store",
-        help="Set test config file (supported format: YAML), e.g. --config=test_config.yml",
-    )
-    parser.addoption(
-        "--output",
-        action="store",
-        help="Set output file for test report, e.g. --output=pytest_report.md",
-    )
-    """Add custom command line options"""
-    parser.addoption("--env", action="append", help="Set environment variables, e.g. --env KEY=value")
-    parser.addoption(
-        "--inference-model",
-        action="store",
-        default="meta-llama/Llama-3.2-3B-Instruct",
-        help="Specify the inference model to use for testing",
-    )
-    parser.addoption(
-        "--safety-shield",
-        action="store",
-        default="meta-llama/Llama-Guard-3-1B",
-        help="Specify the safety shield to use for testing",
-    )
-    parser.addoption(
-        "--embedding-model",
-        action="store",
-        default=None,
-        help="Specify the embedding model to use for testing",
-    )
-    parser.addoption(
-        "--judge-model",
-        action="store",
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        help="Specify the judge model to use for testing",
-    )
-
-
-def make_provider_id(providers: Dict[str, str]) -> str:
-    return ":".join(f"{api}={provider}" for api, provider in sorted(providers.items()))
-
-
-def get_provider_marks(providers: Dict[str, str]) -> List[Any]:
-    marks = []
-    for provider in providers.values():
-        marks.append(getattr(pytest.mark, provider))
-    return marks
-
-
-def get_provider_fixture_overrides(config, available_fixtures: Dict[str, List[str]]) -> Optional[List[pytest.param]]:
-    provider_str = config.getoption("--providers")
-    if not provider_str:
-        return None
-
-    fixture_dict = parse_fixture_string(provider_str, available_fixtures)
-    return [
-        pytest.param(
-            fixture_dict,
-            id=make_provider_id(fixture_dict),
-            marks=get_provider_marks(fixture_dict),
-        )
-    ]
-
-
-def parse_fixture_string(provider_str: str, available_fixtures: Dict[str, List[str]]) -> Dict[str, str]:
-    """Parse provider string of format 'api1=provider1,api2=provider2'"""
-    if not provider_str:
-        return {}
-
-    fixtures = {}
-    pairs = provider_str.split(",")
-    for pair in pairs:
-        if "=" not in pair:
-            raise ValueError(f"Invalid provider specification: {pair}. Expected format: api=provider")
-        api, fixture = pair.split("=")
-        if api not in available_fixtures:
-            raise ValueError(f"Unknown API: {api}. Available APIs: {list(available_fixtures.keys())}")
-        if fixture not in available_fixtures[api]:
-            raise ValueError(
-                f"Unknown provider '{fixture}' for API '{api}'. Available providers: {list(available_fixtures[api])}"
-            )
-        fixtures[api] = fixture
-
-    # Check that all provided APIs are supported
-    for api in available_fixtures.keys():
-        if api not in fixtures:
-            raise ValueError(
-                f"Missing provider fixture for API '{api}'. Available providers: {list(available_fixtures[api])}"
-            )
-    return fixtures
-
-
-def pytest_itemcollected(item):
-    # Get all markers as a list
-    filtered = ("asyncio", "parametrize")
-    marks = [mark.name for mark in item.iter_markers() if mark.name not in filtered]
-    if marks:
-        marks = colored(",".join(marks), "yellow")
-        item.name = f"{item.name}[{marks}]"
-
-
-def pytest_collection_modifyitems(session, config, items):
-    test_config = get_test_config_from_config_file(config)
-    if test_config is None:
-        return
-
-    required_tests = defaultdict(set)
-    for api_test_config in [
-        test_config.inference,
-        test_config.memory,
-        test_config.agents,
-    ]:
-        if api_test_config is None:
-            continue
-        for test in api_test_config.tests:
-            arr = test.split("::")
-            if len(arr) != 2:
-                raise ValueError(f"Invalid format for test name {test}")
-            test_path, func_name = arr
-            required_tests[Path(__file__).parent / test_path].add(func_name)
-
-    new_items, deselected_items = [], []
-    for item in items:
-        func_name = getattr(item, "originalname", item.name)
-        if func_name in required_tests[item.fspath]:
-            new_items.append(item)
-            continue
-        deselected_items.append(item)
-
-    items[:] = new_items
-    config.hook.pytest_deselected(items=deselected_items)
-
-
-pytest_plugins = [
-    "llama_stack.providers.tests.inference.fixtures",
-    "llama_stack.providers.tests.safety.fixtures",
-    "llama_stack.providers.tests.vector_io.fixtures",
-    "llama_stack.providers.tests.agents.fixtures",
-    "llama_stack.providers.tests.datasetio.fixtures",
-    "llama_stack.providers.tests.scoring.fixtures",
-    "llama_stack.providers.tests.eval.fixtures",
-    "llama_stack.providers.tests.post_training.fixtures",
-    "llama_stack.providers.tests.tools.fixtures",
-]
diff --git a/llama_stack/providers/tests/report.py b/llama_stack/providers/tests/report.py
deleted file mode 100644
index bc29534be..000000000
--- a/llama_stack/providers/tests/report.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from collections import defaultdict
-from pathlib import Path
-
-import pytest
-from pytest import ExitCode
-from pytest_html.basereport import _process_outcome
-
-from llama_stack.models.llama.sku_list import all_registered_models
-from llama_stack.models.llama.sku_types import CoreModelId
-
-INFERENCE_APIS = ["chat_completion"]
-FUNCTIONALITIES = ["streaming", "structured_output", "tool_calling"]
-SUPPORTED_MODELS = {
-    "ollama": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_1b.value,
-    },
-    "fireworks": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_11b_vision.value,
-    },
-    "together": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_11b_vision.value,
-    },
-}
-
-
-class Report:
-    def __init__(self, output_path):
-        valid_file_format = (
-            output_path.split(".")[1] in ["md", "markdown"] if len(output_path.split(".")) == 2 else False
-        )
-        if not valid_file_format:
-            raise ValueError(f"Invalid output file {output_path}. Markdown file is required")
-        self.output_path = output_path
-        self.test_data = defaultdict(dict)
-        self.inference_tests = defaultdict(dict)
-
-    @pytest.hookimpl
-    def pytest_runtest_logreport(self, report):
-        # This hook is called in several phases, including setup, call and teardown
-        # The test is considered failed / error if any of the outcomes is not "Passed"
-        outcome = _process_outcome(report)
-        data = {
-            "outcome": report.outcome,
-            "longrepr": report.longrepr,
-            "name": report.nodeid,
-        }
-        if report.nodeid not in self.test_data:
-            self.test_data[report.nodeid] = data
-        elif self.test_data[report.nodeid] != outcome and outcome != "Passed":
-            self.test_data[report.nodeid] = data
-
-    @pytest.hookimpl
-    def pytest_sessionfinish(self, session, exitstatus):
-        if exitstatus <= ExitCode.INTERRUPTED:
-            return
-        report = []
-        report.append("# Llama Stack Integration Test Results Report")
-        report.append("\n## Summary")
-        report.append("\n## Supported Models: ")
-
-        header = "| Model Descriptor |"
-        dividor = "|:---|"
-        for k in SUPPORTED_MODELS.keys():
-            header += f"{k} |"
-            dividor += ":---:|"
-
-        report.append(header)
-        report.append(dividor)
-
-        rows = []
-        for model in all_registered_models():
-            if "Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value:
-                continue
-            row = f"| {model.core_model_id.value} |"
-            for k in SUPPORTED_MODELS.keys():
-                if model.core_model_id.value in SUPPORTED_MODELS[k]:
-                    row += " ✅ |"
-                else:
-                    row += " ❌ |"
-            rows.append(row)
-        report.extend(rows)
-
-        report.append("\n### Tests:")
-
-        for provider in SUPPORTED_MODELS.keys():
-            if provider not in self.inference_tests:
-                continue
-            report.append(f"\n #### {provider}")
-            test_table = [
-                "| Area | Model | API | Functionality Test | Status |",
-                "|:-----|:-----|:-----|:-----|:-----|",
-            ]
-            for api in INFERENCE_APIS:
-                tests = self.inference_tests[provider][api]
-                for test_nodeid in tests:
-                    row = "|{area} | {model} | {api} | {test} | {result} ".format(
-                        area="Text" if "text" in test_nodeid else "Vision",
-                        model=("Llama-3.1-8B-Instruct" if "text" in test_nodeid else "Llama3.2-11B-Vision-Instruct"),
-                        api=f"/{api}",
-                        test=self.get_simple_function_name(test_nodeid),
-                        result=("✅" if self.test_data[test_nodeid]["outcome"] == "passed" else "❌"),
-                    )
-                    test_table += [row]
-            report.extend(test_table)
-            report.append("\n")
-
-        output_file = Path(self.output_path)
-        output_file.write_text("\n".join(report))
-        print(f"\n Report generated: {output_file.absolute()}")
-
-    @pytest.hookimpl(trylast=True)
-    def pytest_collection_modifyitems(self, session, config, items):
-        for item in items:
-            inference = item.callspec.params.get("inference_stack")
-            if "inference" in item.nodeid:
-                func_name = getattr(item, "originalname", item.name)
-                for api in INFERENCE_APIS:
-                    if api in func_name:
-                        api_tests = self.inference_tests[inference].get(api, set())
-                        api_tests.add(item.nodeid)
-                        self.inference_tests[inference][api] = api_tests
-
-    def get_simple_function_name(self, nodeid):
-        """Extract function name from nodeid.
-
-        Examples:
-        - 'tests/test_math.py::test_addition' -> 'test_addition'
-        - 'tests/test_math.py::TestClass::test_method' -> test_method'
-        """
-        parts = nodeid.split("::")
-        func_name = nodeid  # Fallback to full nodeid if pattern doesn't match
-        if len(parts) == 2:  # Simple function
-            func_name = parts[1]
-        elif len(parts) == 3:  # Class method
-            func_name = parts[2]
-        return func_name.split("[")[0]
diff --git a/llama_stack/providers/utils/bedrock/config.py b/llama_stack/providers/utils/bedrock/config.py
index 95019666b..b25617d76 100644
--- a/llama_stack/providers/utils/bedrock/config.py
+++ b/llama_stack/providers/utils/bedrock/config.py
@@ -3,54 +3,53 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Optional
 
 from pydantic import BaseModel, Field
 
 
 class BedrockBaseConfig(BaseModel):
-    aws_access_key_id: Optional[str] = Field(
+    aws_access_key_id: str | None = Field(
         default=None,
         description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
     )
-    aws_secret_access_key: Optional[str] = Field(
+    aws_secret_access_key: str | None = Field(
         default=None,
         description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
     )
-    aws_session_token: Optional[str] = Field(
+    aws_session_token: str | None = Field(
         default=None,
         description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
     )
-    region_name: Optional[str] = Field(
+    region_name: str | None = Field(
         default=None,
         description="The default AWS Region to use, for example, us-west-1 or us-west-2."
         "Default use environment variable: AWS_DEFAULT_REGION",
     )
-    profile_name: Optional[str] = Field(
+    profile_name: str | None = Field(
         default=None,
         description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE",
     )
-    total_max_attempts: Optional[int] = Field(
+    total_max_attempts: int | None = Field(
         default=None,
         description="An integer representing the maximum number of attempts that will be made for a single request, "
         "including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS",
     )
-    retry_mode: Optional[str] = Field(
+    retry_mode: str | None = Field(
         default=None,
         description="A string representing the type of retries Boto3 will perform."
         "Default use environment variable: AWS_RETRY_MODE",
     )
-    connect_timeout: Optional[float] = Field(
+    connect_timeout: float | None = Field(
         default=60,
         description="The time in seconds till a timeout exception is thrown when attempting to make a connection. "
         "The default is 60 seconds.",
     )
-    read_timeout: Optional[float] = Field(
+    read_timeout: float | None = Field(
         default=60,
         description="The time in seconds till a timeout exception is thrown when attempting to read from a connection."
         "The default is 60 seconds.",
     )
-    session_ttl: Optional[int] = Field(
+    session_ttl: int | None = Field(
         default=3600,
         description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
     )
diff --git a/llama_stack/providers/utils/common/data_schema_validator.py b/llama_stack/providers/utils/common/data_schema_validator.py
index eb9d9dd60..28a243863 100644
--- a/llama_stack/providers/utils/common/data_schema_validator.py
+++ b/llama_stack/providers/utils/common/data_schema_validator.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from enum import Enum
-from typing import Any, Dict, List
+from typing import Any
 
 from llama_stack.apis.common.type_system import (
     ChatCompletionInputType,
@@ -85,16 +85,16 @@ def get_valid_schemas(api_str: str):
 
 
 def validate_dataset_schema(
-    dataset_schema: Dict[str, Any],
-    expected_schemas: List[Dict[str, Any]],
+    dataset_schema: dict[str, Any],
+    expected_schemas: list[dict[str, Any]],
 ):
     if dataset_schema not in expected_schemas:
         raise ValueError(f"Dataset {dataset_schema} does not have a correct input schema in {expected_schemas}")
 
 
 def validate_row_schema(
-    input_row: Dict[str, Any],
-    expected_schemas: List[Dict[str, Any]],
+    input_row: dict[str, Any],
+    expected_schemas: list[dict[str, Any]],
 ):
     for schema in expected_schemas:
         if all(key in input_row for key in schema):
diff --git a/llama_stack/providers/utils/inference/__init__.py b/llama_stack/providers/utils/inference/__init__.py
index e36be9404..66269d173 100644
--- a/llama_stack/providers/utils/inference/__init__.py
+++ b/llama_stack/providers/utils/inference/__init__.py
@@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List
-
 from llama_stack.models.llama.sku_list import all_registered_models
 from llama_stack.models.llama.sku_types import *  # noqa: F403
 
@@ -22,7 +20,7 @@ def is_supported_safety_model(model: Model) -> bool:
     ]
 
 
-def supported_inference_models() -> List[Model]:
+def supported_inference_models() -> list[Model]:
     return [
         m
         for m in all_registered_models()
diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py
index 8b14c7502..97cf87360 100644
--- a/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -4,8 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import base64
 import logging
-from typing import TYPE_CHECKING, List, Optional
+import struct
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from sentence_transformers import SentenceTransformer
@@ -15,6 +17,9 @@ from llama_stack.apis.inference import (
     EmbeddingTaskType,
     InterleavedContentItem,
     ModelStore,
+    OpenAIEmbeddingData,
+    OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
     TextTruncation,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
@@ -31,10 +36,10 @@ class SentenceTransformerEmbeddingMixin:
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         model = await self.model_store.get_model(model_id)
         embedding_model = self._load_sentence_transformer_model(model.provider_resource_id)
@@ -43,6 +48,50 @@ class SentenceTransformerEmbeddingMixin:
         )
         return EmbeddingsResponse(embeddings=embeddings)
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        """Embed one or more strings and return them in the OpenAI embeddings response shape.
+
+        :param model: Identifier looked up in ``self.model_store``.
+        :param input: A single string or a list of strings to embed.
+        :param encoding_format: ``"base64"`` returns packed float32 bytes; anything else returns float lists.
+        :param dimensions: Accepted but not used by this implementation.
+        :param user: Accepted but not used by this implementation.
+        :returns: Response whose ``model`` is the provider resource id and whose usage fields are -1.
+        :raises ValueError: If ``input`` is an empty list.
+        """
+        texts = [input] if isinstance(input, str) else input
+        if not texts:
+            raise ValueError("Empty list not supported")
+
+        model_obj = await self.model_store.get_model(model)
+        encoder = self._load_sentence_transformer_model(model_obj.provider_resource_id)
+        vectors = encoder.encode(texts, show_progress_bar=False)
+
+        as_base64 = encoding_format == "base64"
+        data = []
+        for index, vector in enumerate(vectors):
+            if as_base64:
+                # "<" pins little-endian float32 so the payload does not depend on the host byte order.
+                packed = struct.pack(f"<{len(vector)}f", *vector)
+                value = base64.b64encode(packed).decode("ascii")
+            else:
+                value = vector.tolist()
+            data.append(OpenAIEmbeddingData(embedding=value, index=index))
+
+        return OpenAIEmbeddingsResponse(
+            data=data,
+            model=model_obj.provider_resource_id,
+            # Token counts are not computed here; -1 stands in for the real usage.
+            usage=OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1),
+        )
+
     def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
         global EMBEDDING_MODELS
 
diff --git a/llama_stack/providers/utils/inference/inference_store.py b/llama_stack/providers/utils/inference/inference_store.py
new file mode 100644
index 000000000..7b6bc2e3d
--- /dev/null
+++ b/llama_stack/providers/utils/inference/inference_store.py
@@ -0,0 +1,123 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.inference import (
+    ListOpenAIChatCompletionResponse,
+    OpenAIChatCompletion,
+    OpenAICompletionWithInputMessages,
+    OpenAIMessageParam,
+    Order,
+)
+from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
+
+from ..sqlstore.api import ColumnDefinition, ColumnType
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+
+
+class InferenceStore:
+    """Persists OpenAI-style chat completions in the ``chat_completions`` table of a SQL store.
+
+    ``initialize`` must be awaited before storing or reading; until then those methods raise ``ValueError``.
+    """
+
+    def __init__(self, sql_store_config: SqlStoreConfig | None):
+        """Store the config; a falsy value falls back to SQLite at ``RUNTIME_BASE_DIR / "sqlstore.db"``."""
+        if not sql_store_config:
+            sql_store_config = SqliteSqlStoreConfig(db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix())
+        self.sql_store_config = sql_store_config
+        self.sql_store = None
+
+    async def initialize(self):
+        """Create the necessary tables if they don't exist."""
+        self.sql_store = sqlstore_impl(self.sql_store_config)
+        await self.sql_store.create_table(
+            "chat_completions",
+            {
+                "id": ColumnDefinition(type=ColumnType.STRING, primary_key=True),
+                "created": ColumnType.INTEGER,
+                "model": ColumnType.STRING,
+                "choices": ColumnType.JSON,
+                "input_messages": ColumnType.JSON,
+            },
+        )
+
+    def _require_store(self):
+        """Return the SQL store, raising ``ValueError`` while ``self.sql_store`` is still unset."""
+        if not self.sql_store:
+            raise ValueError("Inference store is not initialized")
+        return self.sql_store
+
+    @staticmethod
+    def _to_completion(row) -> OpenAICompletionWithInputMessages:
+        """Build the API object from one ``chat_completions`` row."""
+        return OpenAICompletionWithInputMessages(
+            id=row["id"],
+            created=row["created"],
+            model=row["model"],
+            choices=row["choices"],
+            input_messages=row["input_messages"],
+        )
+
+    async def store_chat_completion(
+        self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
+    ) -> None:
+        """Insert one completion row along with the dumped input messages that produced it."""
+        store = self._require_store()
+        data = chat_completion.model_dump()
+        await store.insert(
+            "chat_completions",
+            {
+                "id": data["id"],
+                "created": data["created"],
+                "model": data["model"],
+                "choices": data["choices"],
+                "input_messages": [message.model_dump() for message in input_messages],
+            },
+        )
+
+    async def list_chat_completions(
+        self,
+        after: str | None = None,
+        limit: int | None = 50,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIChatCompletionResponse:
+        """
+        List chat completions from the database.
+
+        :param after: The ID of the last chat completion to return. Not supported yet.
+        :param limit: The maximum number of chat completions to return.
+        :param model: The model to filter by.
+        :param order: The order to sort the chat completions by (on ``created``); falsy means descending.
+        :raises ValueError: If the store has not been initialized.
+        :raises NotImplementedError: If ``after`` is given.
+        """
+        store = self._require_store()
+        # TODO: support after
+        if after:
+            raise NotImplementedError("After is not supported for SQLite")
+
+        rows = await store.fetch_all(
+            "chat_completions",
+            where={"model": model} if model else None,
+            order_by=[("created", (order or Order.desc).value)],
+            limit=limit,
+        )
+        data = [self._to_completion(row) for row in rows]
+        return ListOpenAIChatCompletionResponse(
+            data=data,
+            # TODO: implement has_more
+            has_more=False,
+            first_id=data[0].id if data else "",
+            last_id=data[-1].id if data else "",
+        )
+
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        """Return the stored completion with this id; raises ``ValueError`` if it is missing."""
+        store = self._require_store()
+        row = await store.fetch_one("chat_completions", where={"id": completion_id})
+        if not row:
+            raise ValueError(f"Chat completion with id {completion_id} not found") from None
+        return self._to_completion(row)
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index efe7031f5..dab10bc55 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -4,7 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+import base64
+import struct
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any
 
 import litellm
 
@@ -18,7 +21,7 @@ from llama_stack.apis.inference import (
     ChatCompletionResponseStreamChunk,
     EmbeddingsResponse,
     EmbeddingTaskType,
-    Inference,
+    InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
@@ -34,6 +37,9 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
     OpenAICompletion,
+    OpenAIEmbeddingData,
+    OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
 )
@@ -58,13 +64,16 @@ logger = get_logger(name=__name__, category="inference")
 
 class LiteLLMOpenAIMixin(
     ModelRegistryHelper,
-    Inference,
+    InferenceProvider,
     NeedsRequestProviderData,
 ):
+    # TODO: avoid exposing the litellm specific model names to the user.
+    #       potential change: add a prefix param that gets added to the model name
+    #                         when calling litellm.
     def __init__(
         self,
         model_entries,
-        api_key_from_config: Optional[str],
+        api_key_from_config: str | None,
         provider_data_api_key_field: str,
         openai_compat_api_base: str | None = None,
     ):
@@ -90,30 +99,35 @@ class LiteLLMOpenAIMixin(
             raise ValueError(f"Unsupported model: {model.provider_resource_id}")
         return model
 
+    def get_litellm_model_name(self, model_id: str) -> str:
+        # Add the "openai/" prefix for openai-compat providers unless the id already carries it
+        # (openai/models.py used prefixed names by default; accepted for backwards compatibility).
+        return model_id if not self.is_openai_compat or model_id.startswith("openai/") else "openai/" + model_id
+
     async def completion(
         self,
         model_id: str,
         content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
     ) -> AsyncGenerator:
         raise NotImplementedError("LiteLLM does not support completion requests")
 
     async def chat_completion(
         self,
         model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
+    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
         if sampling_params is None:
             sampling_params = SamplingParams()
 
@@ -130,8 +144,7 @@ class LiteLLMOpenAIMixin(
         )
 
         params = await self._get_params(request)
-        if self.is_openai_compat:
-            params["model"] = "openai/" + params["model"]
+        params["model"] = self.get_litellm_model_name(params["model"])
 
         logger.debug(f"params to litellm (openai compat): {params}")
         # unfortunately, we need to use synchronous litellm.completion here because litellm
@@ -220,65 +233,113 @@ class LiteLLMOpenAIMixin(
                     else request.tool_config.tool_choice
                 )
 
-        provider_data = self.get_request_provider_data()
-        key_field = self.provider_data_api_key_field
-        if provider_data and getattr(provider_data, key_field, None):
-            api_key = getattr(provider_data, key_field)
-        else:
-            api_key = self.api_key_from_config
-
         return {
             "model": request.model,
-            "api_key": api_key,
+            "api_key": self.get_api_key(),
             "api_base": self.api_base,
             **input_dict,
             "stream": request.stream,
             **get_sampling_options(request.sampling_params),
         }
 
+    def get_api_key(self) -> str | None:
+        # A key supplied in the per-request provider data wins over the configured one.
+        # The configured key is accepted by __init__ as `str | None`, so this may return None.
+        provider_data = self.get_request_provider_data()
+        key_field = self.provider_data_api_key_field
+        if provider_data and getattr(provider_data, key_field, None):
+            return getattr(provider_data, key_field)
+        return self.api_key_from_config
+
     async def embeddings(
         self,
         model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         model = await self.model_store.get_model(model_id)
 
         response = litellm.embedding(
-            model=model.provider_resource_id,
+            model=self.get_litellm_model_name(model.provider_resource_id),
             input=[interleaved_content_as_str(content) for content in contents],
         )
 
         embeddings = [data["embedding"] for data in response["data"]]
         return EmbeddingsResponse(embeddings=embeddings)
 
+    async def openai_embeddings(
+        self,
+        model: str,
+        input: str | list[str],
+        encoding_format: str | None = "float",
+        dimensions: int | None = None,
+        user: str | None = None,
+    ) -> OpenAIEmbeddingsResponse:
+        """Create OpenAI-format embeddings for `input` through litellm.
+
+        `encoding_format="base64"` returns each vector as base64-encoded
+        little-endian float32 bytes; any other value returns the raw float list.
+        `user` is accepted for API compatibility but is not forwarded.
+        """
+        model_obj = await self.model_store.get_model(model)
+
+        response = litellm.embedding(
+            model=self.get_litellm_model_name(model_obj.provider_resource_id),
+            # normalize a bare string to a one-element list
+            input=[input] if isinstance(input, str) else input,
+            api_key=self.get_api_key(),
+            api_base=self.api_base,
+            dimensions=dimensions,
+        )
+
+        data = []
+        for i, embedding_data in enumerate(response["data"]):
+            embedding = embedding_data["embedding"]
+            if encoding_format == "base64":
+                # "<f" pins little-endian float32 so the payload does not depend on
+                # the byte order of the host running the server
+                byte_data = struct.pack(f"<{len(embedding)}f", *embedding)
+                embedding = base64.b64encode(byte_data).decode("utf-8")
+            data.append(OpenAIEmbeddingData(embedding=embedding, index=i))
+
+        usage = OpenAIEmbeddingUsage(
+            prompt_tokens=response["usage"]["prompt_tokens"],
+            total_tokens=response["usage"]["total_tokens"],
+        )
+
+        return OpenAIEmbeddingsResponse(
+            data=data,
+            model=model_obj.provider_resource_id,
+            usage=usage,
+        )
+
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id,
+            model=self.get_litellm_model_name(model_obj.provider_resource_id),
             prompt=prompt,
             best_of=best_of,
             echo=echo,
@@ -297,38 +358,40 @@ class LiteLLMOpenAIMixin(
             user=user,
             guided_choice=guided_choice,
             prompt_logprobs=prompt_logprobs,
+            api_key=self.get_api_key(),
+            api_base=self.api_base,
         )
         return await litellm.atext_completion(**params)
 
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id,
+            model=self.get_litellm_model_name(model_obj.provider_resource_id),
             messages=messages,
             frequency_penalty=frequency_penalty,
             function_call=function_call,
@@ -351,27 +414,29 @@ class LiteLLMOpenAIMixin(
             top_logprobs=top_logprobs,
             top_p=top_p,
             user=user,
+            api_key=self.get_api_key(),
+            api_base=self.api_base,
         )
         return await litellm.acompletion(**params)
 
     async def batch_completion(
         self,
         model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch completion is not supported for OpenAI Compat")
 
     async def batch_chat_completion(
         self,
         model_id: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_config: Optional[ToolConfig] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_config: ToolConfig | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
     ):
         raise NotImplementedError("Batch chat completion is not supported for OpenAI Compat")
diff --git a/llama_stack/providers/utils/inference/model_registry.py b/llama_stack/providers/utils/inference/model_registry.py
index 4d7063953..d707e36c2 100644
--- a/llama_stack/providers/utils/inference/model_registry.py
+++ b/llama_stack/providers/utils/inference/model_registry.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
@@ -20,13 +20,13 @@ from llama_stack.providers.utils.inference import (
 # more closer to the Model class.
 class ProviderModelEntry(BaseModel):
     provider_model_id: str
-    aliases: List[str] = Field(default_factory=list)
-    llama_model: Optional[str] = None
+    aliases: list[str] = Field(default_factory=list)
+    llama_model: str | None = None
     model_type: ModelType = ModelType.llm
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: dict[str, Any] = Field(default_factory=dict)
 
 
-def get_huggingface_repo(model_descriptor: str) -> Optional[str]:
+def get_huggingface_repo(model_descriptor: str) -> str | None:
     for model in all_registered_models():
         if model.descriptor() == model_descriptor:
             return model.huggingface_repo
@@ -34,7 +34,7 @@ def get_huggingface_repo(model_descriptor: str) -> Optional[str]:
 
 
 def build_hf_repo_model_entry(
-    provider_model_id: str, model_descriptor: str, additional_aliases: Optional[List[str]] = None
+    provider_model_id: str, model_descriptor: str, additional_aliases: list[str] | None = None
 ) -> ProviderModelEntry:
     aliases = [
         get_huggingface_repo(model_descriptor),
@@ -58,7 +58,7 @@ def build_model_entry(provider_model_id: str, model_descriptor: str) -> Provider
 
 
 class ModelRegistryHelper(ModelsProtocolPrivate):
-    def __init__(self, model_entries: List[ProviderModelEntry]):
+    def __init__(self, model_entries: list[ProviderModelEntry]):
         self.alias_to_provider_id_map = {}
         self.provider_id_to_llama_model_map = {}
         for entry in model_entries:
@@ -72,43 +72,53 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
                 self.alias_to_provider_id_map[entry.llama_model] = entry.provider_model_id
                 self.provider_id_to_llama_model_map[entry.provider_model_id] = entry.llama_model
 
-    def get_provider_model_id(self, identifier: str) -> Optional[str]:
+    def get_provider_model_id(self, identifier: str) -> str | None:
         return self.alias_to_provider_id_map.get(identifier, None)
 
-    def get_llama_model(self, provider_model_id: str) -> Optional[str]:
+    # TODO: why keep a separate llama model mapping?
+    def get_llama_model(self, provider_model_id: str) -> str | None:
         return self.provider_id_to_llama_model_map.get(provider_model_id, None)
 
     async def register_model(self, model: Model) -> Model:
+        if not (supported_model_id := self.get_provider_model_id(model.provider_resource_id)):
+            raise ValueError(
+                f"Model '{model.provider_resource_id}' is not supported. Supported models are: {', '.join(self.alias_to_provider_id_map.keys())}"
+            )
+        provider_resource_id = self.get_provider_model_id(model.model_id)  # set when model_id is already a known alias
         if model.model_type == ModelType.embedding:
             # embedding models are always registered by their provider model id and does not need to be mapped to a llama model
             provider_resource_id = model.provider_resource_id
-        else:
-            provider_resource_id = self.get_provider_model_id(model.provider_resource_id)
-
         if provider_resource_id:
-            model.provider_resource_id = provider_resource_id
+            if provider_resource_id != supported_model_id:  # be idempotent, only reject differences
+                raise ValueError(
+                    f"Model id '{model.model_id}' is already registered. Please use a different id or unregister it first."
+                )
         else:
             llama_model = model.metadata.get("llama_model")
-            if llama_model is None:
-                return model
+            if llama_model:
+                existing_llama_model = self.get_llama_model(model.provider_resource_id)
+                if existing_llama_model:
+                    if existing_llama_model != llama_model:
+                        raise ValueError(
+                            f"Provider model id '{model.provider_resource_id}' is already registered to a different llama model: '{existing_llama_model}'"
+                        )
+                else:
+                    if llama_model not in ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR:
+                        raise ValueError(
+                            f"Invalid llama_model '{llama_model}' specified in metadata. "
+                            f"Must be one of: {', '.join(ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR.keys())}"
+                        )
+                    self.provider_id_to_llama_model_map[model.provider_resource_id] = (
+                        ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR[llama_model]
+                    )
 
-            existing_llama_model = self.get_llama_model(model.provider_resource_id)
-            if existing_llama_model:
-                if existing_llama_model != llama_model:
-                    raise ValueError(
-                        f"Provider model id '{model.provider_resource_id}' is already registered to a different llama model: '{existing_llama_model}'"
-                    )
-            else:
-                if llama_model not in ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR:
-                    raise ValueError(
-                        f"Invalid llama_model '{llama_model}' specified in metadata. "
-                        f"Must be one of: {', '.join(ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR.keys())}"
-                    )
-                self.provider_id_to_llama_model_map[model.provider_resource_id] = (
-                    ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR[llama_model]
-                )
+        self.alias_to_provider_id_map[model.model_id] = supported_model_id  # model_id now resolves like any other alias
 
         return model
 
     async def unregister_model(self, model_id: str) -> None:
-        pass
+        # TODO: should we block unregistering base supported provider model IDs?
+        if model_id not in self.alias_to_provider_id_map:
+            raise ValueError(f"Model id '{model_id}' is not registered.")
+        # Only the alias entry is removed; provider_id_to_llama_model_map is left as-is.
+        del self.alias_to_provider_id_map[model_id]
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 4d690287b..049f06fdb 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -8,16 +8,9 @@ import logging
 import time
 import uuid
 import warnings
+from collections.abc import AsyncGenerator, AsyncIterator, Awaitable, Iterable
 from typing import (
     Any,
-    AsyncGenerator,
-    AsyncIterator,
-    Awaitable,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Union,
 )
 
 from openai import AsyncStream
@@ -115,6 +108,7 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAICompletion,
     OpenAICompletionChoice,
+    OpenAIMessageParam,
     OpenAIResponseFormatParam,
     ToolConfig,
 )
@@ -141,24 +135,24 @@ class OpenAICompatCompletionChoiceDelta(BaseModel):
 
 
 class OpenAICompatLogprobs(BaseModel):
-    text_offset: Optional[List[int]] = None
+    text_offset: list[int] | None = None
 
-    token_logprobs: Optional[List[float]] = None
+    token_logprobs: list[float] | None = None
 
-    tokens: Optional[List[str]] = None
+    tokens: list[str] | None = None
 
-    top_logprobs: Optional[List[Dict[str, float]]] = None
+    top_logprobs: list[dict[str, float]] | None = None
 
 
 class OpenAICompatCompletionChoice(BaseModel):
-    finish_reason: Optional[str] = None
-    text: Optional[str] = None
-    delta: Optional[OpenAICompatCompletionChoiceDelta] = None
-    logprobs: Optional[OpenAICompatLogprobs] = None
+    finish_reason: str | None = None
+    text: str | None = None
+    delta: OpenAICompatCompletionChoiceDelta | None = None
+    logprobs: OpenAICompatLogprobs | None = None
 
 
 class OpenAICompatCompletionResponse(BaseModel):
-    choices: List[OpenAICompatCompletionChoice]
+    choices: list[OpenAICompatCompletionChoice]
 
 
 def get_sampling_strategy_options(params: SamplingParams) -> dict:
@@ -217,8 +211,8 @@ def get_stop_reason(finish_reason: str) -> StopReason:
 
 
 def convert_openai_completion_logprobs(
-    logprobs: Optional[OpenAICompatLogprobs],
-) -> Optional[List[TokenLogProbs]]:
+    logprobs: OpenAICompatLogprobs | None,
+) -> list[TokenLogProbs] | None:
     if not logprobs:
         return None
     if hasattr(logprobs, "top_logprobs"):
@@ -235,7 +229,7 @@ def convert_openai_completion_logprobs(
     return None
 
 
-def convert_openai_completion_logprobs_stream(text: str, logprobs: Optional[Union[float, OpenAICompatLogprobs]]):
+def convert_openai_completion_logprobs_stream(text: str, logprobs: float | OpenAICompatLogprobs | None):
     if logprobs is None:
         return None
     if isinstance(logprobs, float):
@@ -532,13 +526,24 @@ async def convert_message_to_openai_dict(message: Message, download: bool = Fals
     if hasattr(message, "tool_calls") and message.tool_calls:
         result["tool_calls"] = []
         for tc in message.tool_calls:
+            # The tool.tool_name can be a str or a BuiltinTool enum. If
+            # it's the latter, convert to a string.
+            tool_name = tc.tool_name
+            if isinstance(tool_name, BuiltinTool):
+                tool_name = tool_name.value
+
+            # arguments_json can be None, so attempt it first and fall back to arguments
+            if hasattr(tc, "arguments_json") and tc.arguments_json:
+                arguments = tc.arguments_json
+            else:
+                arguments = json.dumps(tc.arguments)
             result["tool_calls"].append(
                 {
                     "id": tc.call_id,
                     "type": "function",
                     "function": {
-                        "name": tc.tool_name,
-                        "arguments": tc.arguments_json if hasattr(tc, "arguments_json") else json.dumps(tc.arguments),
+                        "name": tool_name,
+                        "arguments": arguments,
                     },
                 }
             )
@@ -557,7 +562,7 @@ class UnparseableToolCall(BaseModel):
 
 
 async def convert_message_to_openai_dict_new(
-    message: Message | Dict,
+    message: Message | dict,
 ) -> OpenAIChatCompletionMessage:
     """
     Convert a Message to an OpenAI API-compatible dictionary.
@@ -586,14 +591,10 @@ async def convert_message_to_openai_dict_new(
     #  List[...] -> List[...]
     async def _convert_message_content(
         content: InterleavedContent,
-    ) -> Union[str, Iterable[OpenAIChatCompletionContentPartParam]]:
+    ) -> str | Iterable[OpenAIChatCompletionContentPartParam]:
         async def impl(
             content_: InterleavedContent,
-        ) -> Union[
-            str,
-            OpenAIChatCompletionContentPartParam,
-            List[OpenAIChatCompletionContentPartParam],
-        ]:
+        ) -> str | OpenAIChatCompletionContentPartParam | list[OpenAIChatCompletionContentPartParam]:
             # Llama Stack and OpenAI spec match for str and text input
             if isinstance(content_, str):
                 return content_
@@ -638,10 +639,13 @@ async def convert_message_to_openai_dict_new(
             )
             for tool in message.tool_calls
         ]
+        params = {}
+        if tool_calls:
+            params["tool_calls"] = tool_calls
         out = OpenAIChatCompletionAssistantMessage(
             role="assistant",
             content=await _convert_message_content(message.content),
-            tool_calls=tool_calls or None,
+            **params,
         )
     elif isinstance(message, ToolResponseMessage):
         out = OpenAIChatCompletionToolMessage(
@@ -662,7 +666,7 @@ async def convert_message_to_openai_dict_new(
 
 def convert_tool_call(
     tool_call: ChatCompletionMessageToolCall,
-) -> Union[ToolCall, UnparseableToolCall]:
+) -> ToolCall | UnparseableToolCall:
     """
     Convert a ChatCompletionMessageToolCall tool call to either a
     ToolCall or UnparseableToolCall. Returns an UnparseableToolCall
@@ -838,7 +842,7 @@ def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
     }.get(finish_reason, StopReason.end_of_turn)
 
 
-def _convert_openai_request_tool_config(tool_choice: Optional[Union[str, Dict[str, Any]]] = None) -> ToolConfig:
+def _convert_openai_request_tool_config(tool_choice: str | dict[str, Any] | None = None) -> ToolConfig:
     tool_config = ToolConfig()
     if tool_choice:
         try:
@@ -849,7 +853,7 @@ def _convert_openai_request_tool_config(tool_choice: Optional[Union[str, Dict[st
     return tool_config
 
 
-def _convert_openai_request_tools(tools: Optional[List[Dict[str, Any]]] = None) -> List[ToolDefinition]:
+def _convert_openai_request_tools(tools: list[dict[str, Any]] | None = None) -> list[ToolDefinition]:
     lls_tools = []
     if not tools:
         return lls_tools
@@ -865,7 +869,7 @@ def _convert_openai_request_tools(tools: Optional[List[Dict[str, Any]]] = None)
             tool_param_properties = tool_params.get("properties", {})
             for tool_param_key, tool_param_value in tool_param_properties.items():
                 tool_param_def = ToolParamDefinition(
-                    param_type=tool_param_value.get("type", None),
+                    param_type=str(tool_param_value.get("type", None)),
                     description=tool_param_value.get("description", None),
                 )
                 lls_tool_params[tool_param_key] = tool_param_def
@@ -895,8 +899,8 @@ def _convert_openai_request_response_format(
 
 
 def _convert_openai_tool_calls(
-    tool_calls: List[OpenAIChatCompletionMessageToolCall],
-) -> List[ToolCall]:
+    tool_calls: list[OpenAIChatCompletionMessageToolCall],
+) -> list[ToolCall]:
     """
     Convert an OpenAI ChatCompletionMessageToolCall list into a list of ToolCall.
 
@@ -932,7 +936,7 @@ def _convert_openai_tool_calls(
 
 def _convert_openai_logprobs(
     logprobs: OpenAIChoiceLogprobs,
-) -> Optional[List[TokenLogProbs]]:
+) -> list[TokenLogProbs] | None:
     """
     Convert an OpenAI ChoiceLogprobs into a list of TokenLogProbs.
 
@@ -965,9 +969,9 @@ def _convert_openai_logprobs(
 
 
 def _convert_openai_sampling_params(
-    max_tokens: Optional[int] = None,
-    temperature: Optional[float] = None,
-    top_p: Optional[float] = None,
+    max_tokens: int | None = None,
+    temperature: float | None = None,
+    top_p: float | None = None,
 ) -> SamplingParams:
     sampling_params = SamplingParams()
 
@@ -990,20 +994,20 @@ def _convert_openai_sampling_params(
 
 
 def openai_messages_to_messages(
-    messages: List[OpenAIChatCompletionMessage],
-) -> List[Message]:
+    messages: list[OpenAIMessageParam],
+) -> list[Message]:
     """
     Convert a list of OpenAIChatCompletionMessage into a list of Message.
     """
     converted_messages = []
     for message in messages:
         if message.role == "system":
-            converted_message = SystemMessage(content=message.content)
+            converted_message = SystemMessage(content=openai_content_to_content(message.content))
         elif message.role == "user":
             converted_message = UserMessage(content=openai_content_to_content(message.content))
         elif message.role == "assistant":
             converted_message = CompletionMessage(
-                content=message.content,
+                content=openai_content_to_content(message.content),
                 tool_calls=_convert_openai_tool_calls(message.tool_calls),
                 stop_reason=StopReason.end_of_turn,
             )
@@ -1019,7 +1023,7 @@ def openai_messages_to_messages(
     return converted_messages
 
 
-def openai_content_to_content(content: Union[str, Iterable[OpenAIChatCompletionContentPartParam]]):
+def openai_content_to_content(content: str | Iterable[OpenAIChatCompletionContentPartParam]):
     if isinstance(content, str):
         return content
     elif isinstance(content, list):
@@ -1265,24 +1269,24 @@ class OpenAICompletionToLlamaStackMixin:
     async def openai_completion(
         self,
         model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
     ) -> OpenAICompletion:
         if stream:
             raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
@@ -1334,29 +1338,29 @@ class OpenAIChatCompletionToLlamaStackMixin:
     async def openai_chat_completion(
         self,
         model: str,
-        messages: List[OpenAIChatCompletionMessage],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         messages = openai_messages_to_messages(messages)
         response_format = _convert_openai_request_response_format(response_format)
         sampling_params = _convert_openai_sampling_params(
@@ -1395,12 +1399,11 @@ class OpenAIChatCompletionToLlamaStackMixin:
     async def _process_stream_response(
         self,
         model: str,
-        outstanding_responses: List[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]],
+        outstanding_responses: list[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]],
     ):
         id = f"chatcmpl-{uuid.uuid4()}"
-        for outstanding_response in outstanding_responses:
+        for i, outstanding_response in enumerate(outstanding_responses):
             response = await outstanding_response
-            i = 0
             async for chunk in response:
                 event = chunk.event
                 finish_reason = _convert_stop_reason_to_openai_finish_reason(event.stop_reason)
@@ -1455,10 +1458,9 @@ class OpenAIChatCompletionToLlamaStackMixin:
                             model=model,
                             object="chat.completion.chunk",
                         )
-                i = i + 1
 
     async def _process_non_stream_response(
-        self, model: str, outstanding_responses: List[Awaitable[ChatCompletionResponse]]
+        self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
     ) -> OpenAIChatCompletion:
         choices = []
         for outstanding_response in outstanding_responses:
diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py
index 657dc4b86..56e33cfdf 100644
--- a/llama_stack/providers/utils/inference/prompt_adapter.py
+++ b/llama_stack/providers/utils/inference/prompt_adapter.py
@@ -9,7 +9,6 @@ import base64
 import io
 import json
 import re
-from typing import List, Optional, Tuple, Union
 
 import httpx
 from PIL import Image as PIL_Image
@@ -63,7 +62,7 @@ log = get_logger(name=__name__, category="inference")
 
 
 class ChatCompletionRequestWithRawContent(ChatCompletionRequest):
-    messages: List[RawMessage]
+    messages: list[RawMessage]
 
 
 class CompletionRequestWithRawContent(CompletionRequest):
@@ -93,8 +92,8 @@ def interleaved_content_as_str(content: InterleavedContent, sep: str = " ") -> s
 
 
 async def convert_request_to_raw(
-    request: Union[ChatCompletionRequest, CompletionRequest],
-) -> Union[ChatCompletionRequestWithRawContent, CompletionRequestWithRawContent]:
+    request: ChatCompletionRequest | CompletionRequest,
+) -> ChatCompletionRequestWithRawContent | CompletionRequestWithRawContent:
     if isinstance(request, ChatCompletionRequest):
         messages = []
         for m in request.messages:
@@ -170,18 +169,18 @@ def content_has_media(content: InterleavedContent):
         return _has_media_content(content)
 
 
-def messages_have_media(messages: List[Message]):
+def messages_have_media(messages: list[Message]):
     return any(content_has_media(m.content) for m in messages)
 
 
-def request_has_media(request: Union[ChatCompletionRequest, CompletionRequest]):
+def request_has_media(request: ChatCompletionRequest | CompletionRequest):
     if isinstance(request, ChatCompletionRequest):
         return messages_have_media(request.messages)
     else:
         return content_has_media(request.content)
 
 
-async def localize_image_content(media: ImageContentItem) -> Tuple[bytes, str]:
+async def localize_image_content(media: ImageContentItem) -> tuple[bytes, str]:
     image = media.image
     if image.url and image.url.uri.startswith("http"):
         async with httpx.AsyncClient() as client:
@@ -228,7 +227,7 @@ async def completion_request_to_prompt(request: CompletionRequest) -> str:
 
 async def completion_request_to_prompt_model_input_info(
     request: CompletionRequest,
-) -> Tuple[str, int]:
+) -> tuple[str, int]:
     content = augment_content_with_response_format_prompt(request.response_format, request.content)
     request.content = content
     request = await convert_request_to_raw(request)
@@ -265,7 +264,7 @@ async def chat_completion_request_to_prompt(request: ChatCompletionRequest, llam
 
 async def chat_completion_request_to_model_input_info(
     request: ChatCompletionRequest, llama_model: str
-) -> Tuple[str, int]:
+) -> tuple[str, int]:
     messages = chat_completion_request_to_messages(request, llama_model)
     request.messages = messages
     request = await convert_request_to_raw(request)
@@ -284,7 +283,7 @@ async def chat_completion_request_to_model_input_info(
 def chat_completion_request_to_messages(
     request: ChatCompletionRequest,
     llama_model: str,
-) -> List[Message]:
+) -> list[Message]:
     """Reads chat completion request and augments the messages to handle tools.
     For eg. for llama_3_1, add system message with the appropriate tools or
     add user messsage for custom tools, etc.
@@ -323,7 +322,7 @@ def chat_completion_request_to_messages(
     return messages
 
 
-def response_format_prompt(fmt: Optional[ResponseFormat]):
+def response_format_prompt(fmt: ResponseFormat | None):
     if not fmt:
         return None
 
@@ -337,7 +336,7 @@ def response_format_prompt(fmt: Optional[ResponseFormat]):
 
 def augment_messages_for_tools_llama_3_1(
     request: ChatCompletionRequest,
-) -> List[Message]:
+) -> list[Message]:
     existing_messages = request.messages
     existing_system_message = None
     if existing_messages[0].role == Role.system.value:
@@ -383,7 +382,7 @@ def augment_messages_for_tools_llama_3_1(
 
     messages.append(SystemMessage(content=sys_content))
 
-    has_custom_tools = any(isinstance(dfn.tool_name, str) for dfn in request.tools)
+    has_custom_tools = request.tools is not None and any(isinstance(dfn.tool_name, str) for dfn in request.tools)
     if has_custom_tools:
         fmt = request.tool_config.tool_prompt_format or ToolPromptFormat.json
         if fmt == ToolPromptFormat.json:
@@ -406,7 +405,7 @@ def augment_messages_for_tools_llama_3_1(
 def augment_messages_for_tools_llama(
     request: ChatCompletionRequest,
     custom_tool_prompt_generator,
-) -> List[Message]:
+) -> list[Message]:
     existing_messages = request.messages
     existing_system_message = None
     if existing_messages[0].role == Role.system.value:
@@ -457,7 +456,7 @@ def augment_messages_for_tools_llama(
     return messages
 
 
-def _get_tool_choice_prompt(tool_choice: ToolChoice | str, tools: List[ToolDefinition]) -> str:
+def _get_tool_choice_prompt(tool_choice: ToolChoice | str, tools: list[ToolDefinition]) -> str:
     if tool_choice == ToolChoice.auto:
         return ""
     elif tool_choice == ToolChoice.required:
diff --git a/llama_stack/providers/utils/inference/stream_utils.py b/llama_stack/providers/utils/inference/stream_utils.py
new file mode 100644
index 000000000..a2edbb9c8
--- /dev/null
+++ b/llama_stack/providers/utils/inference/stream_utils.py
@@ -0,0 +1,129 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from collections.abc import AsyncIterator
+from datetime import datetime, timezone
+from typing import Any
+
+from llama_stack.apis.inference import (
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionToolCall,
+    OpenAIChatCompletionToolCallFunction,
+    OpenAIChoice,
+    OpenAIChoiceLogprobs,
+    OpenAIMessageParam,
+)
+from llama_stack.providers.utils.inference.inference_store import InferenceStore
+
+
+async def stream_and_store_openai_completion(
+    provider_stream: AsyncIterator[OpenAIChatCompletionChunk],
+    model: str,
+    store: InferenceStore,
+    input_messages: list[OpenAIMessageParam],
+) -> AsyncIterator[OpenAIChatCompletionChunk]:
+    """
+    Re-yield every provider chunk unchanged; when iteration ends or aborts, store the merged OpenAIChatCompletion.
+    """
+    id = None
+    created = None
+    choices_data: dict[int, dict[str, Any]] = {}
+
+    try:
+        async for chunk in provider_stream:
+            if id is None and chunk.id:
+                id = chunk.id
+            if created is None and chunk.created:
+                created = chunk.created
+
+            if chunk.choices:
+                for choice_delta in chunk.choices:
+                    idx = choice_delta.index
+                    if idx not in choices_data:
+                        choices_data[idx] = {
+                            "content_parts": [],
+                            "tool_calls_builder": {},
+                            "finish_reason": None,
+                            "logprobs_content_parts": [],
+                        }
+                    current_choice_data = choices_data[idx]
+
+                    if choice_delta.delta:
+                        delta = choice_delta.delta
+                        if delta.content:
+                            current_choice_data["content_parts"].append(delta.content)
+                        if delta.tool_calls:
+                            for tool_call_delta in delta.tool_calls:
+                                tc_idx = tool_call_delta.index
+                                if tc_idx not in current_choice_data["tool_calls_builder"]:
+                                    # First delta for this tool-call index: start an accumulator for its fragments
+                                    current_choice_data["tool_calls_builder"][tc_idx] = {
+                                        "id": None,
+                                        "type": "function",
+                                        "function_name_parts": [],
+                                        "function_arguments_parts": [],
+                                    }
+                                builder = current_choice_data["tool_calls_builder"][tc_idx]
+                                if tool_call_delta.id:
+                                    builder["id"] = tool_call_delta.id
+                                if tool_call_delta.type:
+                                    builder["type"] = tool_call_delta.type
+                                if tool_call_delta.function:
+                                    if tool_call_delta.function.name:
+                                        builder["function_name_parts"].append(tool_call_delta.function.name)
+                                    if tool_call_delta.function.arguments:
+                                        builder["function_arguments_parts"].append(tool_call_delta.function.arguments)
+                    if choice_delta.finish_reason:
+                        current_choice_data["finish_reason"] = choice_delta.finish_reason
+                    if choice_delta.logprobs and choice_delta.logprobs.content:
+                        # Collect this chunk's logprob entries; they are wrapped in OpenAIChoiceLogprobs below
+                        current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content)
+            yield chunk
+    finally:  # runs on normal completion, on a provider error, and when the consumer closes the generator
+        if id:  # nothing is stored when no chunk ever carried an id
+            assembled_choices: list[OpenAIChoice] = []
+            for choice_idx, choice_data in choices_data.items():
+                content_str = "".join(choice_data["content_parts"])
+                assembled_tool_calls: list[OpenAIChatCompletionToolCall] = []
+                if choice_data["tool_calls_builder"]:
+                    for tc_build_data in choice_data["tool_calls_builder"].values():
+                        if tc_build_data["id"]:  # tool calls that never received an id are dropped
+                            func_name = "".join(tc_build_data["function_name_parts"])
+                            func_args = "".join(tc_build_data["function_arguments_parts"])
+                            assembled_tool_calls.append(
+                                OpenAIChatCompletionToolCall(
+                                    id=tc_build_data["id"],
+                                    type=tc_build_data["type"],  # "function" unless a delta supplied a type
+                                    function=OpenAIChatCompletionToolCallFunction(name=func_name, arguments=func_args),
+                                )
+                            )
+                message = OpenAIAssistantMessageParam(
+                    role="assistant",
+                    content=content_str if content_str else None,
+                    tool_calls=assembled_tool_calls if assembled_tool_calls else None,
+                )
+                logprobs_content = choice_data["logprobs_content_parts"]
+                final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None
+
+                assembled_choices.append(
+                    OpenAIChoice(
+                        finish_reason=choice_data["finish_reason"],
+                        index=choice_idx,
+                        message=message,
+                        logprobs=final_logprobs,
+                    )
+                )
+
+            final_response = OpenAIChatCompletion(
+                id=id,
+                choices=assembled_choices,
+                created=created or int(datetime.now(timezone.utc).timestamp()),
+                model=model,
+                object="chat.completion",
+            )
+            await store.store_chat_completion(final_response, input_messages)
diff --git a/llama_stack/providers/utils/kvstore/api.py b/llama_stack/providers/utils/kvstore/api.py
index 84b1730e1..d17dc66e1 100644
--- a/llama_stack/providers/utils/kvstore/api.py
+++ b/llama_stack/providers/utils/kvstore/api.py
@@ -5,15 +5,17 @@
 # the root directory of this source tree.
 
 from datetime import datetime
-from typing import List, Optional, Protocol
+from typing import Protocol
 
 
 class KVStore(Protocol):
     # TODO: make the value type bytes instead of str
-    async def set(self, key: str, value: str, expiration: Optional[datetime] = None) -> None: ...
+    async def set(self, key: str, value: str, expiration: datetime | None = None) -> None: ...
 
-    async def get(self, key: str) -> Optional[str]: ...
+    async def get(self, key: str) -> str | None: ...
 
     async def delete(self, key: str) -> None: ...
 
-    async def range(self, start_key: str, end_key: str) -> List[str]: ...
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]: ...
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]: ...
diff --git a/llama_stack/providers/utils/kvstore/config.py b/llama_stack/providers/utils/kvstore/config.py
index 4f85982be..bbb0c5c0a 100644
--- a/llama_stack/providers/utils/kvstore/config.py
+++ b/llama_stack/providers/utils/kvstore/config.py
@@ -6,10 +6,9 @@
 
 import re
 from enum import Enum
-from typing import Literal, Optional, Union
+from typing import Annotated, Literal
 
 from pydantic import BaseModel, Field, field_validator
-from typing_extensions import Annotated
 
 from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
 
@@ -22,7 +21,7 @@ class KVStoreType(Enum):
 
 
 class CommonConfig(BaseModel):
-    namespace: Optional[str] = Field(
+    namespace: str | None = Field(
         default=None,
         description="All keys will be prefixed with this namespace",
     )
@@ -66,10 +65,10 @@ class SqliteKVStoreConfig(CommonConfig):
 class PostgresKVStoreConfig(CommonConfig):
     type: Literal[KVStoreType.postgres.value] = KVStoreType.postgres.value
     host: str = "localhost"
-    port: int = 5432
+    port: str = "5432"
     db: str = "llamastack"
     user: str
-    password: Optional[str] = None
+    password: str | None = None
     table_name: str = "llamastack_kvstore"
 
     @classmethod
@@ -108,7 +107,7 @@ class MongoDBKVStoreConfig(CommonConfig):
     port: int = 27017
     db: str = "llamastack"
     user: str = None
-    password: Optional[str] = None
+    password: str | None = None
     collection_name: str = "llamastack_kvstore"
 
     @classmethod
@@ -126,6 +125,6 @@ class MongoDBKVStoreConfig(CommonConfig):
 
 
 KVStoreConfig = Annotated[
-    Union[RedisKVStoreConfig, SqliteKVStoreConfig, PostgresKVStoreConfig, MongoDBKVStoreConfig],
+    RedisKVStoreConfig | SqliteKVStoreConfig | PostgresKVStoreConfig | MongoDBKVStoreConfig,
     Field(discriminator="type", default=KVStoreType.sqlite.value),
 ]
diff --git a/llama_stack/providers/utils/kvstore/kvstore.py b/llama_stack/providers/utils/kvstore/kvstore.py
index 6bc175260..3a1ee8a26 100644
--- a/llama_stack/providers/utils/kvstore/kvstore.py
+++ b/llama_stack/providers/utils/kvstore/kvstore.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List, Optional
 
 from .api import KVStore
 from .config import KVStoreConfig, KVStoreType
@@ -21,15 +20,22 @@ class InmemoryKVStoreImpl(KVStore):
     async def initialize(self) -> None:
         pass
 
-    async def get(self, key: str) -> Optional[str]:
+    async def get(self, key: str) -> str | None:
         return self._store.get(key)
 
     async def set(self, key: str, value: str) -> None:
         self._store[key] = value
 
-    async def range(self, start_key: str, end_key: str) -> List[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         return [self._store[key] for key in self._store.keys() if key >= start_key and key < end_key]
 
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        """Return the keys k with start_key <= k < end_key, in the order the store yields them."""
+        return [k for k in self._store if start_key <= k < end_key]
+
+    async def delete(self, key: str) -> None:
+        self._store.pop(key, None)  # missing key is a no-op (no KeyError), like the SQLite/Mongo/Redis deletes
+
 
 async def kvstore_impl(config: KVStoreConfig) -> KVStore:
     if config.type == KVStoreType.redis.value:
diff --git a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
index c1581dc8d..3842773d9 100644
--- a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
+++ b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
@@ -6,7 +6,6 @@
 
 import logging
 from datetime import datetime
-from typing import List, Optional
 
 from pymongo import AsyncMongoClient
 
@@ -43,12 +42,12 @@ class MongoDBKVStoreImpl(KVStore):
             return key
         return f"{self.config.namespace}:{key}"
 
-    async def set(self, key: str, value: str, expiration: Optional[datetime] = None) -> None:
+    async def set(self, key: str, value: str, expiration: datetime | None = None) -> None:
         key = self._namespaced_key(key)
         update_query = {"$set": {"value": value, "expiration": expiration}}
         await self.collection.update_one({"key": key}, update_query, upsert=True)
 
-    async def get(self, key: str) -> Optional[str]:
+    async def get(self, key: str) -> str | None:
         key = self._namespaced_key(key)
         query = {"key": key}
         result = await self.collection.find_one(query, {"value": 1, "_id": 0})
@@ -58,7 +57,7 @@ class MongoDBKVStoreImpl(KVStore):
         key = self._namespaced_key(key)
         await self.collection.delete_one({"key": key})
 
-    async def range(self, start_key: str, end_key: str) -> List[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         start_key = self._namespaced_key(start_key)
         end_key = self._namespaced_key(end_key)
         query = {
@@ -69,3 +68,10 @@ class MongoDBKVStoreImpl(KVStore):
         async for doc in cursor:
             result.append(doc["value"])
         return result
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        start_key = self._namespaced_key(start_key)
+        end_key = self._namespaced_key(end_key)
+        query = {"key": {"$gte": start_key, "$lt": end_key}}  # half-open range [start_key, end_key)
+        cursor = self.collection.find(query, {"key": 1, "_id": 0}).sort("key", 1)
+        return [doc["key"] async for doc in cursor]  # async cursor: a plain `for` is not supported
diff --git a/llama_stack/providers/utils/kvstore/postgres/postgres.py b/llama_stack/providers/utils/kvstore/postgres/postgres.py
index 097d36066..bd35decfc 100644
--- a/llama_stack/providers/utils/kvstore/postgres/postgres.py
+++ b/llama_stack/providers/utils/kvstore/postgres/postgres.py
@@ -6,7 +6,6 @@
 
 import logging
 from datetime import datetime
-from typing import List, Optional
 
 import psycopg2
 from psycopg2.extras import DictCursor
@@ -54,7 +53,7 @@ class PostgresKVStoreImpl(KVStore):
             return key
         return f"{self.config.namespace}:{key}"
 
-    async def set(self, key: str, value: str, expiration: Optional[datetime] = None) -> None:
+    async def set(self, key: str, value: str, expiration: datetime | None = None) -> None:
         key = self._namespaced_key(key)
         self.cursor.execute(
             f"""
@@ -66,7 +65,7 @@ class PostgresKVStoreImpl(KVStore):
             (key, value, expiration),
         )
 
-    async def get(self, key: str) -> Optional[str]:
+    async def get(self, key: str) -> str | None:
         key = self._namespaced_key(key)
         self.cursor.execute(
             f"""
@@ -86,7 +85,7 @@ class PostgresKVStoreImpl(KVStore):
             (key,),
         )
 
-    async def range(self, start_key: str, end_key: str) -> List[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         start_key = self._namespaced_key(start_key)
         end_key = self._namespaced_key(end_key)
 
@@ -100,3 +99,13 @@ class PostgresKVStoreImpl(KVStore):
             (start_key, end_key),
         )
         return [row[0] for row in self.cursor.fetchall()]
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        start_key = self._namespaced_key(start_key)
+        end_key = self._namespaced_key(end_key)
+        # Half-open range [start_key, end_key); returned keys keep any namespace prefix they were stored with.
+        self.cursor.execute(
+            f"SELECT key FROM {self.config.table_name} WHERE key >= %s AND key < %s",
+            (start_key, end_key),
+        )
+        return [row[0] for row in self.cursor.fetchall()]
diff --git a/llama_stack/providers/utils/kvstore/redis/redis.py b/llama_stack/providers/utils/kvstore/redis/redis.py
index a390ea866..3d2d956c3 100644
--- a/llama_stack/providers/utils/kvstore/redis/redis.py
+++ b/llama_stack/providers/utils/kvstore/redis/redis.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 
 from datetime import datetime
-from typing import List, Optional
 
 from redis.asyncio import Redis
 
@@ -25,13 +24,13 @@ class RedisKVStoreImpl(KVStore):
             return key
         return f"{self.config.namespace}:{key}"
 
-    async def set(self, key: str, value: str, expiration: Optional[datetime] = None) -> None:
+    async def set(self, key: str, value: str, expiration: datetime | None = None) -> None:
         key = self._namespaced_key(key)
         await self.redis.set(key, value)
         if expiration:
             await self.redis.expireat(key, expiration)
 
-    async def get(self, key: str) -> Optional[str]:
+    async def get(self, key: str) -> str | None:
         key = self._namespaced_key(key)
         value = await self.redis.get(key)
         if value is None:
@@ -43,7 +42,7 @@ class RedisKVStoreImpl(KVStore):
         key = self._namespaced_key(key)
         await self.redis.delete(key)
 
-    async def range(self, start_key: str, end_key: str) -> List[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         start_key = self._namespaced_key(start_key)
         end_key = self._namespaced_key(end_key)
         cursor = 0
@@ -68,3 +67,10 @@ class RedisKVStoreImpl(KVStore):
             ]
 
         return []
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        """Return the stored (namespaced) keys k with start_key <= k <= end_key, in sorted order."""
+        lo, hi = self._namespaced_key(start_key), self._namespaced_key(end_key)
+        # set() writes plain string keys and maintains no sorted-set index, so walk the keyspace with SCAN.
+        keys = [k.decode("utf-8") if isinstance(k, bytes) else k async for k in self.redis.scan_iter(count=1000)]
+        return sorted(k for k in keys if lo <= k <= hi)
diff --git a/llama_stack/providers/utils/kvstore/sqlite/sqlite.py b/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
index bc0488aac..4e49e4d8c 100644
--- a/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
+++ b/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
@@ -6,7 +6,6 @@
 
 import os
 from datetime import datetime
-from typing import List, Optional
 
 import aiosqlite
 
@@ -33,7 +32,7 @@ class SqliteKVStoreImpl(KVStore):
             )
             await db.commit()
 
-    async def set(self, key: str, value: str, expiration: Optional[datetime] = None) -> None:
+    async def set(self, key: str, value: str, expiration: datetime | None = None) -> None:
         async with aiosqlite.connect(self.db_path) as db:
             await db.execute(
                 f"INSERT OR REPLACE INTO {self.table_name} (key, value, expiration) VALUES (?, ?, ?)",
@@ -41,7 +40,7 @@ class SqliteKVStoreImpl(KVStore):
             )
             await db.commit()
 
-    async def get(self, key: str) -> Optional[str]:
+    async def get(self, key: str) -> str | None:
         async with aiosqlite.connect(self.db_path) as db:
             async with db.execute(f"SELECT value, expiration FROM {self.table_name} WHERE key = ?", (key,)) as cursor:
                 row = await cursor.fetchone()
@@ -55,7 +54,7 @@ class SqliteKVStoreImpl(KVStore):
             await db.execute(f"DELETE FROM {self.table_name} WHERE key = ?", (key,))
             await db.commit()
 
-    async def range(self, start_key: str, end_key: str) -> List[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         async with aiosqlite.connect(self.db_path) as db:
             async with db.execute(
                 f"SELECT key, value, expiration FROM {self.table_name} WHERE key >= ? AND key <= ?",
@@ -66,3 +65,13 @@ class SqliteKVStoreImpl(KVStore):
                     _, value, _ = row
                     result.append(value)
                 return result
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        """Return all keys k with start_key <= k <= end_key (both bounds inclusive)."""
+        async with aiosqlite.connect(self.db_path) as db:
+            cursor = await db.execute(
+                f"SELECT key FROM {self.table_name} WHERE key >= ? AND key <= ?",
+                (start_key, end_key),
+            )
+            rows = await cursor.fetchall()
+            return [row[0] for row in rows]
diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py
index ba4403ea1..4cd15860b 100644
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@@ -9,7 +9,7 @@ import logging
 import re
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Any
 from urllib.parse import unquote
 
 import httpx
@@ -94,7 +94,7 @@ def content_from_data(data_url: str) -> str:
         return ""
 
 
-def concat_interleaved_content(content: List[InterleavedContent]) -> InterleavedContent:
+def concat_interleaved_content(content: list[InterleavedContent]) -> InterleavedContent:
     """concatenate interleaved content into a single list. ensure that 'str's are converted to TextContentItem when in a list"""
 
     ret = []
@@ -118,58 +118,86 @@ async def content_from_doc(doc: RAGDocument) -> str:
     if isinstance(doc.content, URL):
         if doc.content.uri.startswith("data:"):
             return content_from_data(doc.content.uri)
-        else:
-            async with httpx.AsyncClient() as client:
-                r = await client.get(doc.content.uri)
-            if doc.mime_type == "application/pdf":
-                return parse_pdf(r.content)
-            else:
-                return r.text
-
-    pattern = re.compile("^(https?://|file://|data:)")
-    if pattern.match(doc.content):
-        if doc.content.startswith("data:"):
-            return content_from_data(doc.content)
-        else:
+        async with httpx.AsyncClient() as client:
+            r = await client.get(doc.content.uri)
+        if doc.mime_type == "application/pdf":
+            return parse_pdf(r.content)
+        return r.text
+    elif isinstance(doc.content, str):
+        pattern = re.compile("^(https?://|file://|data:)")
+        if pattern.match(doc.content):
+            if doc.content.startswith("data:"):
+                return content_from_data(doc.content)
             async with httpx.AsyncClient() as client:
                 r = await client.get(doc.content)
             if doc.mime_type == "application/pdf":
                 return parse_pdf(r.content)
-            else:
-                return r.text
-
-    return interleaved_content_as_str(doc.content)
+            return r.text
+        return doc.content
+    else:
+        # will raise ValueError if the content is not List[InterleavedContent] or InterleavedContent
+        return interleaved_content_as_str(doc.content)
 
 
-def make_overlapped_chunks(document_id: str, text: str, window_len: int, overlap_len: int) -> List[Chunk]:
+def make_overlapped_chunks(
+    document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any]
+) -> list[Chunk]:
     tokenizer = Tokenizer.get_instance()
     tokens = tokenizer.encode(text, bos=False, eos=False)
+    try:
+        metadata_string = str(metadata)
+    except Exception as e:
+        raise ValueError("Failed to serialize metadata to string") from e
+
+    metadata_tokens = tokenizer.encode(metadata_string, bos=False, eos=False)
 
     chunks = []
     for i in range(0, len(tokens), window_len - overlap_len):
         toks = tokens[i : i + window_len]
         chunk = tokenizer.decode(toks)
+        chunk_metadata = metadata.copy()
+        chunk_metadata["document_id"] = document_id
+        chunk_metadata["token_count"] = len(toks)
+        chunk_metadata["metadata_token_count"] = len(metadata_tokens)
+
         # chunk is a string
         chunks.append(
             Chunk(
                 content=chunk,
-                metadata={
-                    "token_count": len(toks),
-                    "document_id": document_id,
-                },
+                metadata=chunk_metadata,
             )
         )
 
     return chunks
 
 
+def _validate_embedding(embedding: NDArray, index: int, expected_dimension: int):
+    """Raise ValueError unless embedding is a numeric list/ndarray whose length is expected_dimension."""
+    if not isinstance(embedding, (list | np.ndarray)):
+        raise ValueError(f"Embedding at index {index} must be a list or numpy array, got {type(embedding)}")
+
+    if isinstance(embedding, np.ndarray):
+        if not np.issubdtype(embedding.dtype, np.number):
+            raise ValueError(f"Embedding at index {index} contains non-numeric values")
+    else:
+        if not all(isinstance(e, (float | int | np.number)) for e in embedding):
+            raise ValueError(f"Embedding at index {index} contains non-numeric values")
+
+    if len(embedding) != expected_dimension:
+        raise ValueError(f"Embedding at index {index} has dimension {len(embedding)}, expected {expected_dimension}")
+
+
 class EmbeddingIndex(ABC):
     @abstractmethod
-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
         raise NotImplementedError()
 
     @abstractmethod
-    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+    async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+        raise NotImplementedError()
+
+    @abstractmethod
+    async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
         raise NotImplementedError()
 
     @abstractmethod
@@ -185,26 +213,40 @@ class VectorDBWithIndex:
 
     async def insert_chunks(
         self,
-        chunks: List[Chunk],
+        chunks: list[Chunk],
     ) -> None:
-        embeddings_response = await self.inference_api.embeddings(
-            self.vector_db.embedding_model, [x.content for x in chunks]
-        )
-        embeddings = np.array(embeddings_response.embeddings)
+        chunks_to_embed = []
+        for i, c in enumerate(chunks):
+            if c.embedding is None:
+                chunks_to_embed.append(c)
+            else:
+                _validate_embedding(c.embedding, i, self.vector_db.embedding_dimension)
 
+        if chunks_to_embed:
+            resp = await self.inference_api.embeddings(
+                self.vector_db.embedding_model,
+                [c.content for c in chunks_to_embed],
+            )
+            for c, embedding in zip(chunks_to_embed, resp.embeddings, strict=True):  # fail fast on count mismatch
+                c.embedding = embedding
+
+        embeddings = np.array([c.embedding for c in chunks], dtype=np.float32)
         await self.index.add_chunks(chunks, embeddings)
 
     async def query_chunks(
         self,
         query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
         if params is None:
             params = {}
         k = params.get("max_chunks", 3)
+        mode = params.get("mode")
         score_threshold = params.get("score_threshold", 0.0)
-
-        query_str = interleaved_content_as_str(query)
-        embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_str])
-        query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
-        return await self.index.query(query_vector, k, score_threshold)
+        query_string = interleaved_content_as_str(query)
+        if mode == "keyword":
+            return await self.index.query_keyword(query_string, k, score_threshold)
+        else:
+            embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
+            query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
+            return await self.index.query_vector(query_vector, k, score_threshold)
diff --git a/llama_stack/providers/utils/datasetio/pagination.py b/llama_stack/providers/utils/pagination.py
similarity index 95%
rename from llama_stack/providers/utils/datasetio/pagination.py
rename to llama_stack/providers/utils/pagination.py
index 1b693f8f5..033022491 100644
--- a/llama_stack/providers/utils/datasetio/pagination.py
+++ b/llama_stack/providers/utils/pagination.py
@@ -4,13 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Any, Dict, List
+from typing import Any
 
 from llama_stack.apis.common.responses import PaginatedResponse
 
 
 def paginate_records(
-    records: List[Dict[str, Any]],
+    records: list[dict[str, Any]],
     start_index: int | None = None,
     limit: int | None = None,
 ) -> PaginatedResponse:
diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py
new file mode 100644
index 000000000..15354e3e2
--- /dev/null
+++ b/llama_stack/providers/utils/responses/responses_store.py
@@ -0,0 +1,135 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.agents import (
+    Order,
+)
+from llama_stack.apis.agents.openai_responses import (
+    ListOpenAIResponseInputItem,
+    ListOpenAIResponseObject,
+    OpenAIResponseInput,
+    OpenAIResponseObject,
+    OpenAIResponseObjectWithInput,
+)
+from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
+
+from ..sqlstore.api import ColumnDefinition, ColumnType
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+
+
+class ResponsesStore:
+    def __init__(self, sql_store_config: SqlStoreConfig):
+        if not sql_store_config:
+            sql_store_config = SqliteSqlStoreConfig(
+                db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
+            )
+        self.sql_store = sqlstore_impl(sql_store_config)
+
+    async def initialize(self):
+        """Create the necessary tables if they don't exist."""
+        await self.sql_store.create_table(
+            "openai_responses",
+            {
+                "id": ColumnDefinition(type=ColumnType.STRING, primary_key=True),
+                "created_at": ColumnType.INTEGER,
+                "response_object": ColumnType.JSON,
+                "model": ColumnType.STRING,
+            },
+        )
+
+    async def store_response_object(
+        self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
+    ) -> None:
+        data = response_object.model_dump()
+        data["input"] = [input_item.model_dump() for input_item in input]
+
+        await self.sql_store.insert(
+            "openai_responses",
+            {
+                "id": data["id"],
+                "created_at": data["created_at"],
+                "model": data["model"],
+                "response_object": data,
+            },
+        )
+
+    async def list_responses(
+        self,
+        after: str | None = None,
+        limit: int | None = 50,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIResponseObject:
+        """
+        List stored responses, sorted by their created_at value.
+
+        :param after: Pagination cursor (a response ID); not supported yet, raises NotImplementedError if set.
+        :param limit: The maximum number of responses to return.
+        :param model: If set, only responses stored for this model are returned.
+        :param order: Sort direction applied to created_at; falls back to Order.desc when not provided.
+        """
+        # TODO: support after
+        if after:
+            raise NotImplementedError("After is not supported for SQLite")
+        if not order:
+            order = Order.desc
+
+        rows = await self.sql_store.fetch_all(
+            "openai_responses",
+            where={"model": model} if model else None,
+            order_by=[("created_at", order.value)],
+            limit=limit,
+        )
+
+        data = [OpenAIResponseObjectWithInput(**row["response_object"]) for row in rows]
+        return ListOpenAIResponseObject(
+            data=data,
+            # TODO: implement has_more
+            has_more=False,
+            first_id=data[0].id if data else "",
+            last_id=data[-1].id if data else "",
+        )
+
+    async def get_response_object(self, response_id: str) -> OpenAIResponseObjectWithInput:
+        row = await self.sql_store.fetch_one("openai_responses", where={"id": response_id})
+        if not row:
+            raise ValueError(f"Response with id {response_id} not found") from None
+        return OpenAIResponseObjectWithInput(**row["response_object"])
+
+    async def list_response_input_items(
+        self,
+        response_id: str,
+        after: str | None = None,
+        before: str | None = None,
+        include: list[str] | None = None,
+        limit: int | None = 20,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIResponseInputItem:
+        """
+        List input items for a stored response (ValueError if no response has the given ID).
+
+        :param response_id: The ID of the response to retrieve input items for.
+        :param after: An item ID to list items after; not supported yet (NotImplementedError if set).
+        :param before: An item ID to list items before; not supported yet (NotImplementedError if set).
+        :param include: Additional fields to include; not supported yet (NotImplementedError if set).
+        :param limit: Maximum number of items returned; applied after ordering. None means no cap.
+        :param order: Order.desc returns the stored items reversed; any other value keeps the stored order.
+        """
+        # TODO: support after/before pagination
+        if after or before:
+            raise NotImplementedError("After/before pagination is not supported yet")
+        if include:
+            raise NotImplementedError("Include is not supported yet")
+
+        response_with_input = await self.get_response_object(response_id)
+        input_items = response_with_input.input
+
+        if order == Order.desc:
+            input_items = list(reversed(input_items))
+
+        if limit is not None and len(input_items) > limit:
+            input_items = input_items[:limit]
+
+        return ListOpenAIResponseInputItem(data=input_items)
diff --git a/llama_stack/providers/utils/scheduler.py b/llama_stack/providers/utils/scheduler.py
index d4cffe605..845ab1f02 100644
--- a/llama_stack/providers/utils/scheduler.py
+++ b/llama_stack/providers/utils/scheduler.py
@@ -8,9 +8,10 @@ import abc
 import asyncio
 import functools
 import threading
+from collections.abc import Callable, Coroutine, Iterable
 from datetime import datetime, timezone
 from enum import Enum
-from typing import Any, Callable, Coroutine, Dict, Iterable, Tuple, TypeAlias
+from typing import Any, TypeAlias
 
 from pydantic import BaseModel
 
@@ -38,7 +39,7 @@ class JobArtifact(BaseModel):
     name: str
     # TODO: uri should be a reference to /files API; revisit when /files is implemented
     uri: str | None = None
-    metadata: Dict[str, Any]
+    metadata: dict[str, Any]
 
 
 JobHandler = Callable[
@@ -46,7 +47,7 @@ JobHandler = Callable[
 ]
 
 
-LogMessage: TypeAlias = Tuple[datetime, str]
+LogMessage: TypeAlias = tuple[datetime, str]
 
 
 _COMPLETED_STATUSES = {JobStatus.completed, JobStatus.failed}
@@ -60,7 +61,7 @@ class Job:
         self._handler = handler
         self._artifacts: list[JobArtifact] = []
         self._logs: list[LogMessage] = []
-        self._state_transitions: list[Tuple[datetime, JobStatus]] = [(datetime.now(timezone.utc), JobStatus.new)]
+        self._state_transitions: list[tuple[datetime, JobStatus]] = [(datetime.now(timezone.utc), JobStatus.new)]
 
     @property
     def handler(self) -> JobHandler:
diff --git a/llama_stack/providers/utils/scoring/aggregation_utils.py b/llama_stack/providers/utils/scoring/aggregation_utils.py
index 7254c9433..cff9a112f 100644
--- a/llama_stack/providers/utils/scoring/aggregation_utils.py
+++ b/llama_stack/providers/utils/scoring/aggregation_utils.py
@@ -4,13 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import statistics
-from typing import Any, Dict, List
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import AggregationFunctionType
 
 
-def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
+def aggregate_accuracy(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
     num_correct = sum(result["score"] for result in scoring_results)
     avg_score = num_correct / len(scoring_results)
 
@@ -21,14 +21,14 @@ def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any
     }
 
 
-def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
+def aggregate_average(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
     return {
         "average": sum(result["score"] for result in scoring_results if result["score"] is not None)
         / len([_ for _ in scoring_results if _["score"] is not None]),
     }
 
 
-def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
+def aggregate_weighted_average(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
     return {
         "weighted_average": sum(
             result["score"] * result["weight"]
@@ -40,14 +40,14 @@ def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[
 
 
 def aggregate_categorical_count(
-    scoring_results: List[ScoringResultRow],
-) -> Dict[str, Any]:
+    scoring_results: list[ScoringResultRow],
+) -> dict[str, Any]:
     scores = [str(r["score"]) for r in scoring_results]
     unique_scores = sorted(set(scores))
     return {"categorical_count": {s: scores.count(s) for s in unique_scores}}
 
 
-def aggregate_median(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
+def aggregate_median(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
     scores = [r["score"] for r in scoring_results if r["score"] is not None]
     median = statistics.median(scores) if scores else None
     return {"median": median}
@@ -64,8 +64,8 @@ AGGREGATION_FUNCTIONS = {
 
 
 def aggregate_metrics(
-    scoring_results: List[ScoringResultRow], metrics: List[AggregationFunctionType]
-) -> Dict[str, Any]:
+    scoring_results: list[ScoringResultRow], metrics: list[AggregationFunctionType]
+) -> dict[str, Any]:
     agg_results = {}
     for metric in metrics:
         if metric not in AGGREGATION_FUNCTIONS:
diff --git a/llama_stack/providers/utils/scoring/base_scoring_fn.py b/llama_stack/providers/utils/scoring/base_scoring_fn.py
index 834deb7e1..2fae177b7 100644
--- a/llama_stack/providers/utils/scoring/base_scoring_fn.py
+++ b/llama_stack/providers/utils/scoring/base_scoring_fn.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from llama_stack.apis.scoring import ScoringFnParams, ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFn
@@ -28,28 +28,28 @@ class BaseScoringFn(ABC):
     @abstractmethod
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         raise NotImplementedError()
 
     @abstractmethod
     async def aggregate(
         self,
-        scoring_results: List[ScoringResultRow],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
-    ) -> Dict[str, Any]:
+        scoring_results: list[ScoringResultRow],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
+    ) -> dict[str, Any]:
         raise NotImplementedError()
 
     @abstractmethod
     async def score(
         self,
-        input_rows: List[Dict[str, Any]],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
-    ) -> List[ScoringResultRow]:
+        input_rows: list[dict[str, Any]],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
+    ) -> list[ScoringResultRow]:
         raise NotImplementedError()
 
 
@@ -65,7 +65,7 @@ class RegisteredBaseScoringFn(BaseScoringFn):
     def __str__(self) -> str:
         return self.__class__.__name__
 
-    def get_supported_scoring_fn_defs(self) -> List[ScoringFn]:
+    def get_supported_scoring_fn_defs(self) -> list[ScoringFn]:
         return list(self.supported_fn_defs_registry.values())
 
     def register_scoring_fn_def(self, scoring_fn: ScoringFn) -> None:
@@ -81,18 +81,18 @@ class RegisteredBaseScoringFn(BaseScoringFn):
     @abstractmethod
     async def score_row(
         self,
-        input_row: Dict[str, Any],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
+        input_row: dict[str, Any],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
     ) -> ScoringResultRow:
         raise NotImplementedError()
 
     async def aggregate(
         self,
-        scoring_results: List[ScoringResultRow],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
-    ) -> Dict[str, Any]:
+        scoring_results: list[ScoringResultRow],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
+    ) -> dict[str, Any]:
         params = self.supported_fn_defs_registry[scoring_fn_identifier].params
         if scoring_params is not None:
             if params is None:
@@ -107,8 +107,8 @@ class RegisteredBaseScoringFn(BaseScoringFn):
 
     async def score(
         self,
-        input_rows: List[Dict[str, Any]],
-        scoring_fn_identifier: Optional[str] = None,
-        scoring_params: Optional[ScoringFnParams] = None,
-    ) -> List[ScoringResultRow]:
+        input_rows: list[dict[str, Any]],
+        scoring_fn_identifier: str | None = None,
+        scoring_params: ScoringFnParams | None = None,
+    ) -> list[ScoringResultRow]:
         return [await self.score_row(input_row, scoring_fn_identifier, scoring_params) for input_row in input_rows]
diff --git a/llama_stack/providers/utils/scoring/basic_scoring_utils.py b/llama_stack/providers/utils/scoring/basic_scoring_utils.py
index 91abfdb2e..7372a521c 100644
--- a/llama_stack/providers/utils/scoring/basic_scoring_utils.py
+++ b/llama_stack/providers/utils/scoring/basic_scoring_utils.py
@@ -5,8 +5,8 @@
 # the root directory of this source tree.
 import contextlib
 import signal
+from collections.abc import Iterator
 from types import FrameType
-from typing import Iterator, Optional
 
 
 class TimeoutError(Exception):
@@ -15,7 +15,7 @@ class TimeoutError(Exception):
 
 @contextlib.contextmanager
 def time_limit(seconds: float) -> Iterator[None]:
-    def signal_handler(signum: int, frame: Optional[FrameType]) -> None:
+    def signal_handler(signum: int, frame: FrameType | None) -> None:
         raise TimeoutError("Timed out!")
 
     signal.setitimer(signal.ITIMER_REAL, seconds)
diff --git a/llama_stack/providers/utils/sqlstore/api.py b/llama_stack/providers/utils/sqlstore/api.py
new file mode 100644
index 000000000..ace40e4c4
--- /dev/null
+++ b/llama_stack/providers/utils/sqlstore/api.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from collections.abc import Mapping
+from enum import Enum
+from typing import Any, Literal, Protocol
+
+from pydantic import BaseModel
+
+
+class ColumnType(Enum):
+    INTEGER = "INTEGER"
+    STRING = "STRING"
+    TEXT = "TEXT"
+    FLOAT = "FLOAT"
+    BOOLEAN = "BOOLEAN"
+    JSON = "JSON"
+    DATETIME = "DATETIME"
+
+
+class ColumnDefinition(BaseModel):
+    type: ColumnType
+    primary_key: bool = False
+    nullable: bool = True
+    default: Any = None
+
+
+class SqlStore(Protocol):
+    """
+    Structural interface for an async, table-oriented SQL store (create, insert, fetch, update, delete).
+    """
+
+    async def create_table(self, table: str, schema: Mapping[str, ColumnType | ColumnDefinition]) -> None:
+        """
+        Create `table` with the columns in `schema` (column name -> bare type or full column definition).
+        """
+        pass
+
+    async def insert(self, table: str, data: Mapping[str, Any]) -> None:
+        """
+        Insert one row into `table`; `data` maps column names to values.
+        """
+        pass
+
+    async def fetch_all(
+        self,
+        table: str,
+        where: Mapping[str, Any] | None = None,
+        limit: int | None = None,
+        order_by: list[tuple[str, Literal["asc", "desc"]]] | None = None,
+    ) -> list[dict[str, Any]]:
+        """
+        Fetch rows from `table`, optionally filtered by `where`, sorted by `order_by` and capped at `limit`.
+        """
+        pass
+
+    async def fetch_one(
+        self,
+        table: str,
+        where: Mapping[str, Any] | None = None,
+        order_by: list[tuple[str, Literal["asc", "desc"]]] | None = None,
+    ) -> dict[str, Any] | None:
+        """
+        Fetch the first matching row from `table` (after `order_by`), or None when nothing matches.
+        """
+        pass
+
+    async def update(
+        self,
+        table: str,
+        data: Mapping[str, Any],
+        where: Mapping[str, Any],
+    ) -> None:
+        """
+        Update the rows of `table` that match `where` with the column values in `data`.
+        """
+        pass
+
+    async def delete(
+        self,
+        table: str,
+        where: Mapping[str, Any],
+    ) -> None:
+        """
+        Delete the rows of `table` that match `where`.
+        """
+        pass
diff --git a/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py b/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py
new file mode 100644
index 000000000..825220679
--- /dev/null
+++ b/llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py
@@ -0,0 +1,163 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from collections.abc import Mapping
+from typing import Any, Literal
+
+from sqlalchemy import (
+    JSON,
+    Boolean,
+    Column,
+    DateTime,
+    Float,
+    Integer,
+    MetaData,
+    String,
+    Table,
+    Text,
+    select,
+)
+from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
+
+from .api import ColumnDefinition, ColumnType, SqlStore
+from .sqlstore import SqlAlchemySqlStoreConfig
+
+TYPE_MAPPING: dict[ColumnType, Any] = {
+    ColumnType.INTEGER: Integer,
+    ColumnType.STRING: String,
+    ColumnType.FLOAT: Float,
+    ColumnType.BOOLEAN: Boolean,
+    ColumnType.DATETIME: DateTime,
+    ColumnType.TEXT: Text,
+    ColumnType.JSON: JSON,
+}
+
+
+class SqlAlchemySqlStoreImpl(SqlStore):
+    def __init__(self, config: SqlAlchemySqlStoreConfig):
+        self.config = config
+        self.async_session = async_sessionmaker(create_async_engine(config.engine_str))
+        self.metadata = MetaData()
+
+    async def create_table(
+        self,
+        table: str,
+        schema: Mapping[str, ColumnType | ColumnDefinition],
+    ) -> None:
+        """Define `table` from `schema` and create it in the database if it does not already exist."""
+        if not schema:
+            raise ValueError(f"No columns defined for table '{table}'.")
+
+        sqlalchemy_columns: list[Column] = []
+        for col_name, col_props in schema.items():
+            col_type = None
+            is_primary_key = False
+            is_nullable = True  # Default to nullable
+            if isinstance(col_props, ColumnType):
+                col_type = col_props
+            elif isinstance(col_props, ColumnDefinition):
+                col_type = col_props.type
+                is_primary_key = col_props.primary_key
+                is_nullable = col_props.nullable
+
+            sqlalchemy_type = TYPE_MAPPING.get(col_type)
+            if not sqlalchemy_type:
+                raise ValueError(f"Unsupported column type '{col_type}' for column '{col_name}'.")
+            sqlalchemy_columns.append(
+                Column(col_name, sqlalchemy_type, primary_key=is_primary_key, nullable=is_nullable)
+            )
+
+        # Reuse the existing definition when the table is already registered in this store's metadata
+        if table not in self.metadata.tables:
+            sqlalchemy_table = Table(table, self.metadata, *sqlalchemy_columns)
+        else:
+            sqlalchemy_table = self.metadata.tables[table]
+
+        # checkfirst=True skips tables that already exist; the call-local engine is disposed to release its pool
+        engine = create_async_engine(self.config.engine_str)
+        try:
+            async with engine.begin() as conn:
+                await conn.run_sync(self.metadata.create_all, tables=[sqlalchemy_table], checkfirst=True)
+        finally:
+            await engine.dispose()
+
+    async def insert(self, table: str, data: Mapping[str, Any]) -> None:
+        async with self.async_session() as session:
+            await session.execute(self.metadata.tables[table].insert(), data)
+            await session.commit()
+
+    async def fetch_all(
+        self,
+        table: str,
+        where: Mapping[str, Any] | None = None,
+        limit: int | None = None,
+        order_by: list[tuple[str, Literal["asc", "desc"]]] | None = None,
+    ) -> list[dict[str, Any]]:
+        """Return rows of `table` as dicts; `where` keys are matched by equality, a falsy `limit` means no cap."""
+        async with self.async_session() as session:
+            query = select(self.metadata.tables[table])
+            if where:
+                for key, value in where.items():
+                    query = query.where(self.metadata.tables[table].c[key] == value)
+            if limit:
+                query = query.limit(limit)
+            if order_by:
+                if not isinstance(order_by, list):
+                    raise ValueError(
+                        f"order_by must be a list of tuples (column, order={['asc', 'desc']}), got {order_by}"
+                    )
+                for order in order_by:
+                    if not isinstance(order, tuple):
+                        raise ValueError(
+                            f"order_by must be a list of tuples (column, order={['asc', 'desc']}), got {order_by}"
+                        )
+                    name, order_type = order
+                    if order_type == "asc":
+                        query = query.order_by(self.metadata.tables[table].c[name].asc())
+                    elif order_type == "desc":
+                        query = query.order_by(self.metadata.tables[table].c[name].desc())
+                    else:
+                        raise ValueError(f"Invalid order '{order_type}' for column '{name}'")
+            result = await session.execute(query)
+            # rowcount is not defined for SELECT statements; iterating an empty result already yields []
+            return [dict(row._mapping) for row in result]
+
+    async def fetch_one(
+        self,
+        table: str,
+        where: Mapping[str, Any] | None = None,
+        order_by: list[tuple[str, Literal["asc", "desc"]]] | None = None,
+    ) -> dict[str, Any] | None:
+        rows = await self.fetch_all(table, where, limit=1, order_by=order_by)
+        if not rows:
+            return None
+        return rows[0]
+
+    async def update(
+        self,
+        table: str,
+        data: Mapping[str, Any],
+        where: Mapping[str, Any],
+    ) -> None:
+        if not where:
+            raise ValueError("where is required for update")
+
+        async with self.async_session() as session:
+            stmt = self.metadata.tables[table].update()
+            for key, value in where.items():
+                stmt = stmt.where(self.metadata.tables[table].c[key] == value)
+            await session.execute(stmt, data)
+            await session.commit()
+
+    async def delete(self, table: str, where: Mapping[str, Any]) -> None:
+        if not where:
+            raise ValueError("where is required for delete")
+
+        async with self.async_session() as session:
+            stmt = self.metadata.tables[table].delete()
+            for key, value in where.items():
+                stmt = stmt.where(self.metadata.tables[table].c[key] == value)
+            await session.execute(stmt)
+            await session.commit()
diff --git a/llama_stack/providers/utils/sqlstore/sqlstore.py b/llama_stack/providers/utils/sqlstore/sqlstore.py
new file mode 100644
index 000000000..3091e8f96
--- /dev/null
+++ b/llama_stack/providers/utils/sqlstore/sqlstore.py
@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from abc import abstractmethod
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Literal
+from urllib.parse import quote
+
+from pydantic import BaseModel, Field
+
+from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
+
+from .api import SqlStore
+
+
+# Values of the "type" field that selects a concrete store configuration.
+class SqlStoreType(Enum):
+    sqlite = "sqlite"
+    postgres = "postgres"
+
+
+# Base for store configs handled by the SQLAlchemy implementation; subclasses supply engine_str.
+class SqlAlchemySqlStoreConfig(BaseModel):
+    @property
+    @abstractmethod
+    def engine_str(self) -> str: ...
+
+    # TODO: move this when we have a better way to specify dependencies with internal APIs
+    @property
+    def pip_packages(self) -> list[str]:
+        return ["sqlalchemy[asyncio]"]
+
+
+# SQLite store config; the connection URL uses the aiosqlite driver.
+class SqliteSqlStoreConfig(SqlAlchemySqlStoreConfig):
+    type: Literal["sqlite"] = SqlStoreType.sqlite.value
+    db_path: str = Field(
+        default=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
+        description="Database path, e.g. ~/.llama/distributions/ollama/sqlstore.db",
+    )
+
+    @property
+    def engine_str(self) -> str:
+        # expanduser() resolves a leading "~" in db_path before the URL is built.
+        return "sqlite+aiosqlite:///" + Path(self.db_path).expanduser().as_posix()
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, db_name: str = "sqlstore.db"):
+        # db_path is emitted as a "${env.SQLITE_STORE_DIR:<default>}/<db_name>" template string.
+        return cls(
+            type="sqlite",
+            db_path="${env.SQLITE_STORE_DIR:" + __distro_dir__ + "}/" + db_name,
+        )
+
+    @property
+    def pip_packages(self) -> list[str]:
+        return super().pip_packages + ["aiosqlite"]
+
+
+# PostgreSQL store config; the connection URL uses the asyncpg driver.
+class PostgresSqlStoreConfig(SqlAlchemySqlStoreConfig):
+    type: Literal["postgres"] = SqlStoreType.postgres.value
+    host: str = "localhost"
+    port: str = "5432"
+    db: str = "llamastack"
+    user: str
+    password: str | None = None
+
+    @property
+    def engine_str(self) -> str:
+        """Connection URL for this store, with percent-encoded credentials.
+
+        The password component is omitted when no password is configured, and
+        user/password are percent-encoded so that reserved characters such as
+        "@", ":" or "/" cannot change how the URL is parsed.
+        """
+        credentials = quote(self.user, safe="")
+        if self.password is not None:
+            credentials += ":" + quote(self.password, safe="")
+        return f"postgresql+asyncpg://{credentials}@{self.host}:{self.port}/{self.db}"
+
+    @property
+    def pip_packages(self) -> list[str]:
+        return super().pip_packages + ["asyncpg"]
+
+
+# Either concrete config, discriminated by its "type" field.
+SqlStoreConfig = Annotated[
+    SqliteSqlStoreConfig | PostgresSqlStoreConfig,
+    Field(discriminator="type", default=SqlStoreType.sqlite.value),
+]
+
+
+def sqlstore_impl(config: SqlStoreConfig) -> SqlStore:
+    """Instantiate the SqlStore implementation selected by ``config.type``.
+
+    :raises ValueError: if ``config.type`` is not a known store type.
+    """
+    if config.type in [SqlStoreType.sqlite.value, SqlStoreType.postgres.value]:
+        # Imported only when a SQLAlchemy-backed store is actually requested.
+        from .sqlalchemy_sqlstore import SqlAlchemySqlStoreImpl
+
+        impl = SqlAlchemySqlStoreImpl(config)
+    else:
+        raise ValueError(f"Unknown sqlstore type {config.type}")
+
+    return impl
diff --git a/llama_stack/providers/utils/telemetry/dataset_mixin.py b/llama_stack/providers/utils/telemetry/dataset_mixin.py
index 34c612133..fe729a244 100644
--- a/llama_stack/providers/utils/telemetry/dataset_mixin.py
+++ b/llama_stack/providers/utils/telemetry/dataset_mixin.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List, Optional
 
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.telemetry import QueryCondition, QuerySpansResponse, Span
@@ -17,10 +16,10 @@ class TelemetryDatasetMixin:
 
     async def save_spans_to_dataset(
         self,
-        attribute_filters: List[QueryCondition],
-        attributes_to_save: List[str],
+        attribute_filters: list[QueryCondition],
+        attributes_to_save: list[str],
         dataset_id: str,
-        max_depth: Optional[int] = None,
+        max_depth: int | None = None,
     ) -> None:
         if self.datasetio_api is None:
             raise RuntimeError("DatasetIO API not available")
@@ -48,9 +47,9 @@ class TelemetryDatasetMixin:
 
     async def query_spans(
         self,
-        attribute_filters: List[QueryCondition],
-        attributes_to_return: List[str],
-        max_depth: Optional[int] = None,
+        attribute_filters: list[QueryCondition],
+        attributes_to_return: list[str],
+        max_depth: int | None = None,
     ) -> QuerySpansResponse:
         traces = await self.query_traces(attribute_filters=attribute_filters)
         spans = []
diff --git a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py
index 3248f3fa7..af1145fe7 100644
--- a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py
+++ b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py
@@ -6,7 +6,7 @@
 
 import json
 from datetime import datetime
-from typing import Dict, List, Optional, Protocol
+from typing import Protocol
 
 import aiosqlite
 
@@ -16,18 +16,18 @@ from llama_stack.apis.telemetry import QueryCondition, Span, SpanWithStatus, Tra
 class TraceStore(Protocol):
     async def query_traces(
         self,
-        attribute_filters: Optional[List[QueryCondition]] = None,
-        limit: Optional[int] = 100,
-        offset: Optional[int] = 0,
-        order_by: Optional[List[str]] = None,
-    ) -> List[Trace]: ...
+        attribute_filters: list[QueryCondition] | None = None,
+        limit: int | None = 100,
+        offset: int | None = 0,
+        order_by: list[str] | None = None,
+    ) -> list[Trace]: ...
 
     async def get_span_tree(
         self,
         span_id: str,
-        attributes_to_return: Optional[List[str]] = None,
-        max_depth: Optional[int] = None,
-    ) -> Dict[str, SpanWithStatus]: ...
+        attributes_to_return: list[str] | None = None,
+        max_depth: int | None = None,
+    ) -> dict[str, SpanWithStatus]: ...
 
 
 class SQLiteTraceStore(TraceStore):
@@ -36,11 +36,11 @@ class SQLiteTraceStore(TraceStore):
 
     async def query_traces(
         self,
-        attribute_filters: Optional[List[QueryCondition]] = None,
-        limit: Optional[int] = 100,
-        offset: Optional[int] = 0,
-        order_by: Optional[List[str]] = None,
-    ) -> List[Trace]:
+        attribute_filters: list[QueryCondition] | None = None,
+        limit: int | None = 100,
+        offset: int | None = 0,
+        order_by: list[str] | None = None,
+    ) -> list[Trace]:
         def build_where_clause() -> tuple[str, list]:
             if not attribute_filters:
                 return "", []
@@ -112,9 +112,9 @@ class SQLiteTraceStore(TraceStore):
     async def get_span_tree(
         self,
         span_id: str,
-        attributes_to_return: Optional[List[str]] = None,
-        max_depth: Optional[int] = None,
-    ) -> Dict[str, SpanWithStatus]:
+        attributes_to_return: list[str] | None = None,
+        max_depth: int | None = None,
+    ) -> dict[str, SpanWithStatus]:
         # Build the attributes selection
         attributes_select = "s.attributes"
         if attributes_to_return:
diff --git a/llama_stack/providers/utils/telemetry/trace_protocol.py b/llama_stack/providers/utils/telemetry/trace_protocol.py
index 525ade74d..eb6d8b331 100644
--- a/llama_stack/providers/utils/telemetry/trace_protocol.py
+++ b/llama_stack/providers/utils/telemetry/trace_protocol.py
@@ -7,8 +7,9 @@
 import asyncio
 import inspect
 import json
+from collections.abc import AsyncGenerator, Callable
 from functools import wraps
-from typing import Any, AsyncGenerator, Callable, Type, TypeVar
+from typing import Any, TypeVar
 
 from pydantic import BaseModel
 
@@ -25,13 +26,13 @@ def _prepare_for_json(value: Any) -> str:
     """Serialize a single value into JSON-compatible format."""
     if value is None:
         return ""
-    elif isinstance(value, (str, int, float, bool)):
+    elif isinstance(value, str | int | float | bool):
         return value
     elif hasattr(value, "_name_"):
         return value._name_
     elif isinstance(value, BaseModel):
         return json.loads(value.model_dump_json())
-    elif isinstance(value, (list, tuple, set)):
+    elif isinstance(value, list | tuple | set):
         return [_prepare_for_json(item) for item in value]
     elif isinstance(value, dict):
         return {str(k): _prepare_for_json(v) for k, v in value.items()}
@@ -43,7 +44,7 @@ def _prepare_for_json(value: Any) -> str:
             return str(value)
 
 
-def trace_protocol(cls: Type[T]) -> Type[T]:
+def trace_protocol(cls: type[T]) -> type[T]:
     """
     A class decorator that automatically traces all methods in a protocol/base class
     and its inheriting classes.
diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py
index 3d5c717d6..4edfa6516 100644
--- a/llama_stack/providers/utils/telemetry/tracing.py
+++ b/llama_stack/providers/utils/telemetry/tracing.py
@@ -10,9 +10,10 @@ import logging
 import queue
 import random
 import threading
+from collections.abc import Callable
 from datetime import datetime, timezone
 from functools import wraps
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any
 
 from llama_stack.apis.telemetry import (
     LogSeverity,
@@ -33,6 +34,8 @@ logger = get_logger(__name__, category="core")
 INVALID_SPAN_ID = 0x0000000000000000
 INVALID_TRACE_ID = 0x00000000000000000000000000000000
 
+ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
+
 
 def trace_id_to_str(trace_id: int) -> str:
     """Convenience trace ID formatting method
@@ -106,13 +109,13 @@ class BackgroundLogger:
 
 
 class TraceContext:
-    spans: List[Span] = []
+    spans: list[Span] = []
 
     def __init__(self, logger: BackgroundLogger, trace_id: str):
         self.logger = logger
         self.trace_id = trace_id
 
-    def push_span(self, name: str, attributes: Dict[str, Any] = None) -> Span:
+    def push_span(self, name: str, attributes: dict[str, Any] = None) -> Span:
         current_span = self.get_current_span()
         span = Span(
             span_id=generate_span_id(),
@@ -168,7 +171,7 @@ def setup_logger(api: Telemetry, level: int = logging.INFO):
     root_logger.addHandler(TelemetryHandler())
 
 
-async def start_trace(name: str, attributes: Dict[str, Any] = None) -> TraceContext:
+async def start_trace(name: str, attributes: dict[str, Any] = None) -> TraceContext:
     global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER
 
     if BACKGROUND_LOGGER is None:
@@ -177,7 +180,8 @@ async def start_trace(name: str, attributes: Dict[str, Any] = None) -> TraceCont
 
     trace_id = generate_trace_id()
     context = TraceContext(BACKGROUND_LOGGER, trace_id)
-    context.push_span(name, {"__root__": True, **(attributes or {})})
+    attributes = {marker: True for marker in ROOT_SPAN_MARKERS} | (attributes or {})
+    context.push_span(name, attributes)
 
     CURRENT_TRACE_CONTEXT.set(context)
     return context
@@ -246,7 +250,7 @@ class TelemetryHandler(logging.Handler):
 
 
 class SpanContextManager:
-    def __init__(self, name: str, attributes: Dict[str, Any] = None):
+    def __init__(self, name: str, attributes: dict[str, Any] = None):
         self.name = name
         self.attributes = attributes
         self.span = None
@@ -316,11 +320,11 @@ class SpanContextManager:
         return wrapper
 
 
-def span(name: str, attributes: Dict[str, Any] = None):
+def span(name: str, attributes: dict[str, Any] = None):
     return SpanContextManager(name, attributes)
 
 
-def get_current_span() -> Optional[Span]:
+def get_current_span() -> Span | None:
     global CURRENT_TRACE_CONTEXT
     if CURRENT_TRACE_CONTEXT is None:
         logger.debug("No trace context to get current span")
diff --git a/llama_stack/providers/utils/tools/mcp.py b/llama_stack/providers/utils/tools/mcp.py
new file mode 100644
index 000000000..f024693a0
--- /dev/null
+++ b/llama_stack/providers/utils/tools/mcp.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from contextlib import asynccontextmanager
+from typing import Any
+
+try:
+    # for python < 3.11
+    import exceptiongroup
+
+    BaseExceptionGroup = exceptiongroup.BaseExceptionGroup
+except ImportError:
+    pass
+
+import httpx
+from mcp import ClientSession
+from mcp import types as mcp_types
+from mcp.client.sse import sse_client
+
+from llama_stack.apis.common.content_types import ImageContentItem, InterleavedContentItem, TextContentItem
+from llama_stack.apis.tools import (
+    ListToolDefsResponse,
+    ToolDef,
+    ToolInvocationResult,
+    ToolParameter,
+)
+from llama_stack.distribution.datatypes import AuthenticationRequiredError
+from llama_stack.log import get_logger
+
+logger = get_logger(__name__, category="tools")
+
+
+@asynccontextmanager
+async def sse_client_wrapper(endpoint: str, headers: dict[str, str]):
+    """Yield an initialized MCP session over SSE, mapping HTTP 401 errors to AuthenticationRequiredError."""
+    try:
+        async with sse_client(endpoint, headers=headers) as streams, ClientSession(*streams) as session:
+            await session.initialize()
+            yield session
+    except BaseException as e:
+        # A group is searched through its direct members; a bare exception is checked on its own.
+        candidates = e.exceptions if isinstance(e, BaseExceptionGroup) else (e,)
+        for exc in candidates:
+            if isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 401:
+                raise AuthenticationRequiredError(exc) from exc
+
+        # No 401 response was found: propagate the original exception unchanged.
+        raise
+
+
+async def list_mcp_tools(endpoint: str, headers: dict[str, str]) -> ListToolDefsResponse:
+    """List the tools served by the MCP endpoint as ToolDefs that record the endpoint in their metadata."""
+    async with sse_client_wrapper(endpoint, headers) as session:
+        listing = await session.list_tools()
+        tool_defs = [
+            ToolDef(
+                name=tool.name,
+                description=tool.description,
+                # One ToolParameter per entry under "properties" in the tool's input schema.
+                parameters=[
+                    ToolParameter(
+                        name=param_name,
+                        # Entries lacking "type" / "description" fall back to "string" / "".
+                        parameter_type=param_schema.get("type", "string"),
+                        description=param_schema.get("description", ""),
+                    )
+                    for param_name, param_schema in tool.inputSchema.get("properties", {}).items()
+                ],
+                metadata={
+                    "endpoint": endpoint,
+                },
+            )
+            for tool in listing.tools
+        ]
+    return ListToolDefsResponse(data=tool_defs)
+
+
+async def invoke_mcp_tool(
+    endpoint: str, headers: dict[str, str], tool_name: str, kwargs: dict[str, Any]
+) -> ToolInvocationResult:
+    """Invoke ``tool_name`` on the MCP endpoint and wrap its output in a ToolInvocationResult."""
+    async with sse_client_wrapper(endpoint, headers) as session:
+        outcome = await session.call_tool(tool_name, kwargs)
+        converted: list[InterleavedContentItem] = []
+        for item in outcome.content:
+            match item:
+                case mcp_types.TextContent():
+                    converted.append(TextContentItem(text=item.text))
+                case mcp_types.ImageContent():
+                    converted.append(ImageContentItem(image=item.data))
+                case mcp_types.EmbeddedResource():  # logged and left out of the result
+                    logger.warning(f"EmbeddedResource is not supported: {item}")
+                case _:
+                    raise ValueError(f"Unknown content type: {type(item)}")
+        # error_code is 1 when the MCP result is flagged as an error, otherwise 0.
+        status = 1 if outcome.isError else 0
+        return ToolInvocationResult(content=converted, error_code=status)
diff --git a/llama_stack/schema_utils.py b/llama_stack/schema_utils.py
index 8143f1224..694de333e 100644
--- a/llama_stack/schema_utils.py
+++ b/llama_stack/schema_utils.py
@@ -4,37 +4,38 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, Callable, List, Optional, TypeVar
+from typing import Any, TypeVar
 
 from .strong_typing.schema import json_schema_type, register_schema  # noqa: F401
 
 
 @dataclass
 class WebMethod:
-    route: Optional[str] = None
+    route: str | None = None
     public: bool = False
-    request_examples: Optional[List[Any]] = None
-    response_examples: Optional[List[Any]] = None
-    method: Optional[str] = None
-    raw_bytes_request_body: Optional[bool] = False
+    request_examples: list[Any] | None = None
+    response_examples: list[Any] | None = None
+    method: str | None = None
+    raw_bytes_request_body: bool | None = False
     # A descriptive name of the corresponding span created by tracing
-    descriptive_name: Optional[str] = None
-    experimental: Optional[bool] = False
+    descriptive_name: str | None = None
+    experimental: bool | None = False
 
 
 T = TypeVar("T", bound=Callable[..., Any])
 
 
 def webmethod(
-    route: Optional[str] = None,
-    method: Optional[str] = None,
-    public: Optional[bool] = False,
-    request_examples: Optional[List[Any]] = None,
-    response_examples: Optional[List[Any]] = None,
-    raw_bytes_request_body: Optional[bool] = False,
-    descriptive_name: Optional[str] = None,
-    experimental: Optional[bool] = False,
+    route: str | None = None,
+    method: str | None = None,
+    public: bool | None = False,
+    request_examples: list[Any] | None = None,
+    response_examples: list[Any] | None = None,
+    raw_bytes_request_body: bool | None = False,
+    descriptive_name: str | None = None,
+    experimental: bool | None = False,
 ) -> Callable[[T], T]:
     """
     Decorator that supplies additional metadata to an endpoint operation function.
diff --git a/llama_stack/strong_typing/docstring.py b/llama_stack/strong_typing/docstring.py
index b038d1024..497c9ea82 100644
--- a/llama_stack/strong_typing/docstring.py
+++ b/llama_stack/strong_typing/docstring.py
@@ -11,6 +11,7 @@ Type-safe data interchange for Python data classes.
 """
 
 import builtins
+import collections.abc
 import dataclasses
 import inspect
 import re
@@ -171,6 +172,13 @@ class SupportsDoc(Protocol):
     __doc__: Optional[str]
 
 
+def _maybe_unwrap_async_iterator(t):
+    """Return ``T`` when ``t`` is ``AsyncIterator[T]``; any other type is returned unchanged."""
+    if typing.get_origin(t) is not collections.abc.AsyncIterator:
+        return t
+    return typing.get_args(t)[0]
+
+
 def parse_type(typ: SupportsDoc) -> Docstring:
     """
     Parse the docstring of a type into its components.
@@ -178,6 +186,8 @@ def parse_type(typ: SupportsDoc) -> Docstring:
     :param typ: The type whose documentation string to parse.
     :returns: Components of the documentation string.
     """
+    # Use docstring from the iterator origin type for streaming apis
+    typ = _maybe_unwrap_async_iterator(typ)
 
     doc = get_docstring(typ)
     if doc is None:
diff --git a/llama_stack/strong_typing/schema.py b/llama_stack/strong_typing/schema.py
index 0f5121906..82baddc86 100644
--- a/llama_stack/strong_typing/schema.py
+++ b/llama_stack/strong_typing/schema.py
@@ -10,6 +10,7 @@ Type-safe data interchange for Python data classes.
 :see: https://github.com/hunyadi/strong_typing
 """
 
+import collections.abc
 import dataclasses
 import datetime
 import decimal
@@ -478,6 +479,8 @@ class JsonSchemaGenerator:
                 }
             return ret
         elif origin_type is Literal:
+            if len(typing.get_args(typ)) != 1:
+                raise ValueError(f"Literal type {typ} has {len(typing.get_args(typ))} arguments")
             (literal_value,) = typing.get_args(typ)  # unpack value of literal type
             schema = self.type_to_schema(type(literal_value))
             schema["const"] = literal_value
@@ -485,6 +488,9 @@ class JsonSchemaGenerator:
         elif origin_type is type:
             (concrete_type,) = typing.get_args(typ)  # unpack single tuple element
             return {"const": self.type_to_schema(concrete_type, force_expand=True)}
+        elif origin_type is collections.abc.AsyncIterator:
+            (concrete_type,) = typing.get_args(typ)
+            return self.type_to_schema(concrete_type)
 
         # dictionary of class attributes
         members = dict(inspect.getmembers(typ, lambda a: not inspect.isroutine(a)))
diff --git a/llama_stack/templates/bedrock/bedrock.py b/llama_stack/templates/bedrock/bedrock.py
index f82defb4b..bc3a9304f 100644
--- a/llama_stack/templates/bedrock/bedrock.py
+++ b/llama_stack/templates/bedrock/bedrock.py
@@ -29,7 +29,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -55,10 +54,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml
index 6c07b0478..97a06f77a 100644
--- a/llama_stack/templates/bedrock/build.yaml
+++ b/llama_stack/templates/bedrock/build.yaml
@@ -26,7 +26,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml
index fe21d4bef..a58068a60 100644
--- a/llama_stack/templates/bedrock/run.yaml
+++ b/llama_stack/templates/bedrock/run.yaml
@@ -35,13 +35,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/bedrock/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -87,9 +90,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -99,6 +99,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/inference_store.db
 models:
 - metadata: {}
   model_id: meta.llama3-1-8b-instruct-v1:0
@@ -140,7 +143,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/cerebras/build.yaml b/llama_stack/templates/cerebras/build.yaml
index ef6c43212..f26f4ed9b 100644
--- a/llama_stack/templates/cerebras/build.yaml
+++ b/llama_stack/templates/cerebras/build.yaml
@@ -27,6 +27,8 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/cerebras/cerebras.py b/llama_stack/templates/cerebras/cerebras.py
index c370fb7d0..d891502d8 100644
--- a/llama_stack/templates/cerebras/cerebras.py
+++ b/llama_stack/templates/cerebras/cerebras.py
@@ -34,7 +34,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
         ],
     }
@@ -77,10 +76,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/cerebras/doc_template.md b/llama_stack/templates/cerebras/doc_template.md
index 76f8c34ad..5cae2b2da 100644
--- a/llama_stack/templates/cerebras/doc_template.md
+++ b/llama_stack/templates/cerebras/doc_template.md
@@ -46,7 +46,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
 ```
diff --git a/llama_stack/templates/cerebras/report.md b/llama_stack/templates/cerebras/report.md
deleted file mode 100644
index 7c09474b1..000000000
--- a/llama_stack/templates/cerebras/report.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Report for cerebras distribution
-
-## Supported Models
-| Model Descriptor | cerebras |
-|:---|:---|
-| meta-llama/Llama-3-8B-Instruct | ❌ |
-| meta-llama/Llama-3-70B-Instruct | ❌ |
-| meta-llama/Llama-3.1-8B-Instruct | ✅ |
-| meta-llama/Llama-3.1-70B-Instruct | ❌ |
-| meta-llama/Llama-3.1-405B-Instruct-FP8 | ❌ |
-| meta-llama/Llama-3.2-1B-Instruct | ❌ |
-| meta-llama/Llama-3.2-3B-Instruct | ❌ |
-| meta-llama/Llama-3.2-11B-Vision-Instruct | ❌ |
-| meta-llama/Llama-3.2-90B-Vision-Instruct | ❌ |
-| meta-llama/Llama-3.3-70B-Instruct | ✅ |
-| meta-llama/Llama-Guard-3-11B-Vision | ❌ |
-| meta-llama/Llama-Guard-3-1B | ❌ |
-| meta-llama/Llama-Guard-3-8B | ❌ |
-| meta-llama/Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ❌ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ❌ |
-| /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml
index dc7ee4729..c080536b7 100644
--- a/llama_stack/templates/cerebras/run.yaml
+++ b/llama_stack/templates/cerebras/run.yaml
@@ -41,6 +41,9 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/responses_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -79,9 +82,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/cerebras/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/trace_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -93,15 +96,15 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/inference_store.db
 models:
 - metadata: {}
   model_id: llama3.1-8b
@@ -138,7 +141,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/ci-tests/build.yaml b/llama_stack/templates/ci-tests/build.yaml
index a5c615f2f..9f4fbbdda 100644
--- a/llama_stack/templates/ci-tests/build.yaml
+++ b/llama_stack/templates/ci-tests/build.yaml
@@ -27,7 +27,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/ci-tests/ci_tests.py b/llama_stack/templates/ci-tests/ci_tests.py
index f6e836918..afa8a23ce 100644
--- a/llama_stack/templates/ci-tests/ci_tests.py
+++ b/llama_stack/templates/ci-tests/ci_tests.py
@@ -40,7 +40,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -71,10 +70,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
     available_models = {
         "fireworks": MODEL_ENTRIES,
diff --git a/llama_stack/templates/ci-tests/run.yaml b/llama_stack/templates/ci-tests/run.yaml
index 3c16dd5ea..368187d3a 100644
--- a/llama_stack/templates/ci-tests/run.yaml
+++ b/llama_stack/templates/ci-tests/run.yaml
@@ -38,13 +38,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ci-tests/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -90,9 +93,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -102,6 +102,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/inference_store.db
 models:
 - metadata: {}
   model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
@@ -236,7 +239,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/dell/build.yaml b/llama_stack/templates/dell/build.yaml
index 05b98d56f..513df16c1 100644
--- a/llama_stack/templates/dell/build.yaml
+++ b/llama_stack/templates/dell/build.yaml
@@ -28,6 +28,8 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/dell/dell.py b/llama_stack/templates/dell/dell.py
index 52c5a5476..a7ec5f3b8 100644
--- a/llama_stack/templates/dell/dell.py
+++ b/llama_stack/templates/dell/dell.py
@@ -30,7 +30,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
         ],
     }
@@ -87,10 +86,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/dell/doc_template.md b/llama_stack/templates/dell/doc_template.md
index 26f07130b..6bdd7f81c 100644
--- a/llama_stack/templates/dell/doc_template.md
+++ b/llama_stack/templates/dell/doc_template.md
@@ -143,7 +143,7 @@ docker run \
   -v $HOME/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env DEH_URL=$DEH_URL \
diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml
index 802c56aad..5c6072245 100644
--- a/llama_stack/templates/dell/run-with-safety.yaml
+++ b/llama_stack/templates/dell/run-with-safety.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dell/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -93,15 +96,15 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -127,7 +130,5 @@ tool_groups:
   provider_id: brave-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml
index 4a2d819a9..ffaa0bf2f 100644
--- a/llama_stack/templates/dell/run.yaml
+++ b/llama_stack/templates/dell/run.yaml
@@ -37,13 +37,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dell/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -89,15 +92,15 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -118,7 +121,5 @@ tool_groups:
   provider_id: brave-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/dependencies.json b/llama_stack/templates/dependencies.json
index 4c16411f0..47a35edc0 100644
--- a/llama_stack/templates/dependencies.json
+++ b/llama_stack/templates/dependencies.json
@@ -31,6 +31,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -67,6 +68,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -105,6 +107,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "sqlite-vec",
     "tqdm",
     "transformers",
@@ -145,46 +148,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
-    "tqdm",
-    "transformers",
-    "tree_sitter",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
-  ],
-  "dev": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "emoji",
-    "fastapi",
-    "fire",
-    "fireworks-ai",
-    "httpx",
-    "langdetect",
-    "litellm",
-    "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "pythainlp",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlite-vec",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -224,6 +188,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -261,6 +226,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -299,6 +265,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -337,6 +304,86 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
+    "tqdm",
+    "transformers",
+    "tree_sitter",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "kvant": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "emoji",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "langdetect",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pymongo",
+    "pypdf",
+    "pythainlp",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "sqlalchemy[asyncio]",
+    "tqdm",
+    "transformers",
+    "tree_sitter",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "llama_api": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "emoji",
+    "fastapi",
+    "fire",
+    "httpx",
+    "langdetect",
+    "litellm",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pymongo",
+    "pypdf",
+    "pythainlp",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "sqlalchemy[asyncio]",
+    "sqlite-vec",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -380,6 +427,7 @@
     "scipy",
     "sentence-transformers",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "torch",
     "torchao==0.8.0",
     "torchvision",
@@ -394,6 +442,7 @@
     "aiosqlite",
     "blobfile",
     "chardet",
+    "datasets",
     "faiss-cpu",
     "fastapi",
     "fire",
@@ -414,6 +463,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "uvicorn"
@@ -441,6 +491,7 @@
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
+    "peft",
     "pillow",
     "psycopg2-binary",
     "pymongo",
@@ -451,9 +502,12 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
+    "torch",
     "tqdm",
     "transformers",
     "tree_sitter",
+    "trl",
     "uvicorn"
   ],
   "open-benchmark": [
@@ -487,6 +541,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "sqlite-vec",
     "together",
     "tqdm",
@@ -525,6 +580,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -563,6 +619,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -579,10 +636,11 @@
     "fastapi",
     "fire",
     "httpx",
+    "litellm",
     "matplotlib",
+    "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -595,9 +653,53 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
-    "uvicorn"
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "starter": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "emoji",
+    "fastapi",
+    "fire",
+    "fireworks-ai",
+    "httpx",
+    "langdetect",
+    "litellm",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pymongo",
+    "pypdf",
+    "pythainlp",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "sqlalchemy[asyncio]",
+    "sqlite-vec",
+    "tqdm",
+    "transformers",
+    "tree_sitter",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
   "tgi": [
     "aiohttp",
@@ -632,6 +734,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -670,6 +773,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "together",
     "tqdm",
     "transformers",
@@ -709,6 +813,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "sqlite-vec",
     "tqdm",
     "transformers",
@@ -748,6 +853,7 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
@@ -787,9 +893,12 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "sqlalchemy[asyncio]",
     "tqdm",
     "transformers",
     "tree_sitter",
-    "uvicorn"
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ]
 }
diff --git a/llama_stack/templates/experimental-post-training/build.yaml b/llama_stack/templates/experimental-post-training/build.yaml
index b4b5e2203..55cd189c6 100644
--- a/llama_stack/templates/experimental-post-training/build.yaml
+++ b/llama_stack/templates/experimental-post-training/build.yaml
@@ -13,9 +13,10 @@ distribution_spec:
     - inline::basic
     - inline::braintrust
     post_training:
-    - inline::torchtune
+    - inline::huggingface
     datasetio:
     - inline::localfs
+    - remote::huggingface
     telemetry:
     - inline::meta-reference
     agents:
diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml
index 2ebdfe1aa..393cba41d 100644
--- a/llama_stack/templates/experimental-post-training/run.yaml
+++ b/llama_stack/templates/experimental-post-training/run.yaml
@@ -49,16 +49,24 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/experimental-post-training}/localfs_datasetio.db
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/experimental-post-training}/huggingface_datasetio.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
   post_training:
-  - provider_id: torchtune-post-training
-    provider_type: inline::torchtune
-    config: {
+  - provider_id: huggingface
+    provider_type: inline::huggingface
+    config:
       checkpoint_format: huggingface
-    }
+      distributed_backend: null
+      device: cpu
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml
index 3907eba78..be19181c0 100644
--- a/llama_stack/templates/fireworks/build.yaml
+++ b/llama_stack/templates/fireworks/build.yaml
@@ -28,7 +28,9 @@ distribution_spec:
     - remote::brave-search
     - remote::tavily-search
     - remote::wolfram-alpha
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py
index 449f18bf7..da68475e2 100644
--- a/llama_stack/templates/fireworks/fireworks.py
+++ b/llama_stack/templates/fireworks/fireworks.py
@@ -40,7 +40,6 @@ def get_distribution_template() -> DistributionTemplate:
             "remote::brave-search",
             "remote::tavily-search",
             "remote::wolfram-alpha",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -90,10 +89,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/fireworks/report.md b/llama_stack/templates/fireworks/report.md
deleted file mode 100644
index 2c1ccc943..000000000
--- a/llama_stack/templates/fireworks/report.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# Report for fireworks distribution
-
-## Supported Models
-| Model Descriptor | fireworks |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ❌ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::faiss | /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
-| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
-| inline::meta-reference | /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml
index aa6209db6..41500f6f6 100644
--- a/llama_stack/templates/fireworks/run-with-safety.yaml
+++ b/llama_stack/templates/fireworks/run-with-safety.yaml
@@ -46,13 +46,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -102,9 +105,6 @@ providers:
     provider_type: remote::wolfram-alpha
     config:
       api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -114,6 +114,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/inference_store.db
 models:
 - metadata: {}
   model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
@@ -255,7 +258,5 @@ tool_groups:
   provider_id: wolfram-alpha
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml
index 834ec8260..b1fa03306 100644
--- a/llama_stack/templates/fireworks/run.yaml
+++ b/llama_stack/templates/fireworks/run.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -97,9 +100,6 @@ providers:
     provider_type: remote::wolfram-alpha
     config:
       api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -109,6 +109,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/inference_store.db
 models:
 - metadata: {}
   model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
@@ -245,7 +248,5 @@ tool_groups:
   provider_id: wolfram-alpha
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/groq/build.yaml b/llama_stack/templates/groq/build.yaml
index 3263ce83b..819df22f0 100644
--- a/llama_stack/templates/groq/build.yaml
+++ b/llama_stack/templates/groq/build.yaml
@@ -24,6 +24,8 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/groq/groq.py b/llama_stack/templates/groq/groq.py
index 7999f95cb..4e52aa42d 100644
--- a/llama_stack/templates/groq/groq.py
+++ b/llama_stack/templates/groq/groq.py
@@ -33,7 +33,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
         ],
     }
@@ -72,10 +71,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml
index 444452dcb..db7ebffee 100644
--- a/llama_stack/templates/groq/run.yaml
+++ b/llama_stack/templates/groq/run.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/groq/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -93,15 +96,15 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/inference_store.db
 models:
 - metadata: {}
   model_id: groq/llama3-8b-8192
@@ -203,7 +206,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml
index c2eaaa05b..8ede83694 100644
--- a/llama_stack/templates/hf-endpoint/build.yaml
+++ b/llama_stack/templates/hf-endpoint/build.yaml
@@ -26,7 +26,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py
index 53dc9d38f..69e037299 100644
--- a/llama_stack/templates/hf-endpoint/hf_endpoint.py
+++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py
@@ -32,7 +32,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -79,10 +78,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
index 14753e08b..15cf2a47f 100644
--- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml
+++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
@@ -46,13 +46,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -98,9 +101,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -110,6 +110,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -135,7 +138,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml
index 706ba9122..428edf9a2 100644
--- a/llama_stack/templates/hf-endpoint/run.yaml
+++ b/llama_stack/templates/hf-endpoint/run.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -93,9 +96,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -105,6 +105,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -125,7 +128,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml
index c0cc1e2c2..d0752db9a 100644
--- a/llama_stack/templates/hf-serverless/build.yaml
+++ b/llama_stack/templates/hf-serverless/build.yaml
@@ -27,7 +27,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py
index ad8a72012..ecfe2a167 100644
--- a/llama_stack/templates/hf-serverless/hf_serverless.py
+++ b/llama_stack/templates/hf-serverless/hf_serverless.py
@@ -32,7 +32,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -80,10 +79,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml
index bf26fe507..ab461c6c3 100644
--- a/llama_stack/templates/hf-serverless/run-with-safety.yaml
+++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml
@@ -46,13 +46,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -98,9 +101,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -110,6 +110,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -135,7 +138,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml
index cc973b8de..d238506fb 100644
--- a/llama_stack/templates/hf-serverless/run.yaml
+++ b/llama_stack/templates/hf-serverless/run.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -93,9 +96,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -105,6 +105,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -125,7 +128,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/kvant/__init__.py b/llama_stack/templates/kvant/__init__.py
new file mode 100644
index 000000000..61706f7f6
--- /dev/null
+++ b/llama_stack/templates/kvant/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .kvant import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/kvant/build.yaml b/llama_stack/templates/kvant/build.yaml
new file mode 100644
index 000000000..25afc1f4d
--- /dev/null
+++ b/llama_stack/templates/kvant/build.yaml
@@ -0,0 +1,35 @@
+version: '2'
+distribution_spec:
+  description: distribution for kvant cloud
+  providers:
+    inference:
+    - remote::vllm
+    - inline::sentence-transformers
+    vector_io:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+    eval:
+    - inline::meta-reference
+    datasetio:
+    - remote::huggingface
+    - inline::localfs
+    scoring:
+    - inline::basic
+    - inline::llm-as-judge
+    - inline::braintrust
+    tool_runtime:
+    - remote::brave-search
+    - remote::tavily-search
+    - remote::wolfram-alpha
+    - inline::rag-runtime
+    - remote::model-context-protocol
+image_type: conda
+additional_pip_packages:
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/kvant/kvant.py b/llama_stack/templates/kvant/kvant.py
new file mode 100644
index 000000000..44cfc7016
--- /dev/null
+++ b/llama_stack/templates/kvant/kvant.py
@@ -0,0 +1,136 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import (
+    ModelInput,
+    Provider,
+    ShieldInput,
+    ToolGroupInput,
+)
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
+from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
+from llama_stack.providers.remote.inference.passthrough.config import (
+    PassthroughImplConfig,
+)
+from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    """Build the distribution template for the kvant cloud distribution.
+
+    Returns:
+        A ``DistributionTemplate`` named ``kvant`` with a single ``run.yaml``
+        run config that overrides the inference and vector_io providers and
+        registers the default LLM, an embedding model, one shield and the
+        default tool groups.
+    """
+    # NOTE(review): inference is wired here as "remote::openai" with a
+    # Passthrough sample config, whereas the kvant build.yaml/run.yaml shipped
+    # next to this template use "remote::vllm" -- confirm which is intended
+    # before regenerating those files from this template.
+    providers = {
+        "inference": ["remote::openai", "inline::sentence-transformers"],
+        "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "tool_runtime": [
+            "remote::brave-search",
+            "remote::tavily-search",
+            "remote::wolfram-alpha",
+            "inline::rag-runtime",
+            "remote::model-context-protocol",
+        ],
+    }
+
+    name = "kvant"
+
+    inference_provider = Provider(
+        provider_id="openai",
+        provider_type="remote::openai",
+        config=PassthroughImplConfig.sample_run_config(),
+    )
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
+    vector_io_provider = Provider(
+        provider_id="faiss",
+        provider_type="inline::faiss",
+        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
+
+    default_models = [
+        ModelInput(
+            metadata={},
+            model_id="inference-llama4-maverick",
+            provider_id="openai",
+            provider_model_id="inference-llama4-maverick",
+            model_type=ModelType.llm,
+        ),
+    ]
+
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={"embedding_dimension": 384},
+    )
+    default_tool_groups = [
+        ToolGroupInput(toolgroup_id="builtin::websearch", provider_id="tavily-search"),
+        ToolGroupInput(toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha"),
+        ToolGroupInput(toolgroup_id="builtin::rag", provider_id="rag-runtime"),
+    ]
+
+    return DistributionTemplate(
+        name=name,
+        distro_type="self_hosted",
+        # Keep this wording in sync with distribution_spec.description in the
+        # kvant build.yaml.
+        description="distribution for kvant cloud",
+        container_image=None,
+        providers=providers,
+        available_models_by_provider={
+            "openai": [
+                ProviderModelEntry(
+                    provider_model_id="inference-llama4-maverick",
+                    model_type=ModelType.llm,
+                ),
+            ],
+        },
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider, embedding_provider],
+                    "vector_io": [vector_io_provider],
+                },
+                default_models=default_models + [embedding_model],
+                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
+                default_tool_groups=default_tool_groups,
+            ),
+        },
+        run_config_env_vars={
+            "LLAMA_STACK_PORT": (
+                "8321",
+                "Port for the Llama Stack distribution server",
+            ),
+            "OPENAI_API_KEY": ("", "kvant maas API Key"),
+            "OPENAI_BASE_URL": (
+                "https://maas.kvant.cloud",
+                "kvant maas URL",
+            ),
+        },
+    )
diff --git a/llama_stack/templates/kvant/run.yaml b/llama_stack/templates/kvant/run.yaml
new file mode 100644
index 000000000..99fb6f7fa
--- /dev/null
+++ b/llama_stack/templates/kvant/run.yaml
@@ -0,0 +1,170 @@
+version: '2'
+image_name: kvant
+apis:
+- agents
+- datasetio
+- eval
+- inference
+- safety
+- scoring
+- telemetry
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: kvant
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:https://maas.ai-2.kvant.cloud/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:400000}
+      api_token: ${env.VLLM_API_TOKEN:fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:true}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
+  vector_io:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/responses_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: ${env.OTEL_SERVICE_NAME:}
+      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/trace_store.db
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/meta_reference_eval.db
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/huggingface_datasetio.db
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/localfs_datasetio.db
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+    config: {}
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+    config: {}
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      openai_api_key: ${env.OPENAI_API_KEY:}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: wolfram-alpha
+    provider_type: remote::wolfram-alpha
+    config:
+      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+metadata_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/kvant}/inference_store.db
+models:
+- metadata: {}
+  model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
+  provider_id: kvant
+  provider_model_id: inference-llama4-maverick
+  model_type: llm
+- metadata:
+    embedding_dimension: 1024
+    context_length: 8192
+  model_id: inference-bge-m3
+  provider_id: kvant
+  model_type: embedding
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
+shields:
+- shield_id: meta-llama/Llama-Guard-3-8B
+vector_dbs: []
+# - vector_db_id: test-bge
+#   embedding_model: inference-bge-m3
+#   embedding_dimension: 1024
+#   provider_id: faiss
+# - vector_db_id: test-MiniLM-L6-v2
+#   embedding_model: all-MiniLM-L6-v2
+#   embedding_dimension: 384
+#   provider_id: faiss
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
+  provider_id: tavily-search
+- toolgroup_id: builtin::wolfram_alpha
+  provider_id: wolfram-alpha
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+server:
+  port: 8321
+  auth:
+    provider_type: "oauth2_token"
+    config:
+      jwks:  # no value set; only token introspection is configured below
+      introspection:  # NOTE(review): env var below is spelled "INSTROSPECT" -- confirm before renaming
+        url: ${env.KEYCLOAK_INSTROSPECT:https://iam.phoenix-systems.ch/realms/kvant/protocol/openid-connect/token/introspect}
+        client_id: ${env.KEYCLOAK_CLIENT_ID:llama-stack}
+        client_secret: ${env.KEYCLOAK_CLIENT_SECRET}
+      claims_mapping:
+        sub: projects
+        scope: roles
+        #groups: teams
+        customer/id: teams
+        aud: namespaces
diff --git a/llama_stack/templates/llama_api/__init__.py b/llama_stack/templates/llama_api/__init__.py
new file mode 100644
index 000000000..57cc75730
--- /dev/null
+++ b/llama_stack/templates/llama_api/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .llama_api import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/dev/build.yaml b/llama_stack/templates/llama_api/build.yaml
similarity index 83%
rename from llama_stack/templates/dev/build.yaml
rename to llama_stack/templates/llama_api/build.yaml
index 726ebccca..857e5f014 100644
--- a/llama_stack/templates/dev/build.yaml
+++ b/llama_stack/templates/llama_api/build.yaml
@@ -3,11 +3,7 @@ distribution_spec:
   description: Distribution for running e2e tests in CI
   providers:
     inference:
-    - remote::openai
-    - remote::fireworks
-    - remote::anthropic
-    - remote::gemini
-    - remote::groq
+    - remote::llama-openai-compat
     - inline::sentence-transformers
     vector_io:
     - inline::sqlite-vec
@@ -31,7 +27,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/llama_api/llama_api.py b/llama_stack/templates/llama_api/llama_api.py
new file mode 100644
index 000000000..b4641b9da
--- /dev/null
+++ b/llama_stack/templates/llama_api/llama_api.py
@@ -0,0 +1,153 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import (
+    ModelInput,
+    Provider,
+    ShieldInput,
+    ToolGroupInput,
+)
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
+from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
+    SQLiteVectorIOConfig,
+)
+from llama_stack.providers.remote.inference.llama_openai_compat.config import (
+    LlamaCompatConfig,
+)
+from llama_stack.providers.remote.inference.llama_openai_compat.models import (
+    MODEL_ENTRIES as LLLAMA_MODEL_ENTRIES,
+)
+from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
+from llama_stack.providers.remote.vector_io.pgvector.config import (
+    PGVectorVectorIOConfig,
+)
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)
+
+
+def get_inference_providers() -> tuple[list[Provider], dict[str, list]]:
+    """Build the inference providers for this template.
+
+    Returns:
+        A ``(providers, available_models)`` pair: the ``Provider`` entries and
+        a dict mapping each provider_id to the model entries it offers.
+    """
+    # In this template the API key is optional (empty default in the env ref).
+    provider_specs = [
+        (
+            "llama-openai-compat",
+            LLLAMA_MODEL_ENTRIES,
+            LlamaCompatConfig.sample_run_config(api_key="${env.LLAMA_API_KEY:}"),
+        ),
+    ]
+    inference_providers = [
+        Provider(provider_id=provider_id, provider_type=f"remote::{provider_id}", config=config)
+        for provider_id, _, config in provider_specs
+    ]
+    available_models = {provider_id: model_entries for provider_id, model_entries, _ in provider_specs}
+    return inference_providers, available_models
+
+
+def get_distribution_template() -> DistributionTemplate:
+    """Assemble the ``llama_api`` distribution template.
+
+    The inference providers and their model entries come from
+    ``get_inference_providers()``; the sentence-transformers embedding
+    provider and the vector_io providers are declared here and passed as
+    provider overrides of the single ``run.yaml`` run config.
+    """
+    template_name = "llama_api"
+    inference_providers, available_models = get_inference_providers()
+
+    # Provider types listed per API.
+    provider_types = {
+        "inference": [p.provider_type for p in inference_providers] + ["inline::sentence-transformers"],
+        "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "tool_runtime": [
+            "remote::brave-search",
+            "remote::tavily-search",
+            "inline::rag-runtime",
+            "remote::model-context-protocol",
+        ],
+    }
+
+    sqlite_vec = Provider(
+        provider_id="sqlite-vec",
+        provider_type="inline::sqlite-vec",
+        config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{template_name}"),
+    )
+    chromadb = Provider(
+        provider_id="${env.ENABLE_CHROMADB+chromadb}",
+        provider_type="remote::chromadb",
+        config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"),
+    )
+    pgvector_config = PGVectorVectorIOConfig.sample_run_config(
+        db="${env.PGVECTOR_DB:}",
+        user="${env.PGVECTOR_USER:}",
+        password="${env.PGVECTOR_PASSWORD:}",
+    )
+    pgvector = Provider(
+        provider_id="${env.ENABLE_PGVECTOR+pgvector}",
+        provider_type="remote::pgvector",
+        config=pgvector_config,
+    )
+    sentence_transformers = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
+
+    tool_groups = [
+        ToolGroupInput(toolgroup_id=toolgroup_id, provider_id=provider_id)
+        for toolgroup_id, provider_id in (
+            ("builtin::websearch", "tavily-search"),
+            ("builtin::rag", "rag-runtime"),
+        )
+    ]
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id=sentence_transformers.provider_id,
+        model_type=ModelType.embedding,
+        metadata={"embedding_dimension": 384},
+    )
+    # Model registry derived from the provider model entries, plus the embedding model.
+    registered_models = get_model_registry(available_models) + [embedding_model]
+
+    run_settings = RunConfigSettings(
+        provider_overrides={
+            "inference": inference_providers + [sentence_transformers],
+            "vector_io": [sqlite_vec, chromadb, pgvector],
+        },
+        default_models=registered_models,
+        default_tool_groups=tool_groups,
+        default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
+    )
+    return DistributionTemplate(
+        name=template_name,
+        distro_type="self_hosted",
+        description="Distribution for running e2e tests in CI",
+        container_image=None,
+        template_path=None,
+        providers=provider_types,
+        available_models_by_provider=available_models,
+        run_configs={"run.yaml": run_settings},
+        run_config_env_vars={
+            "LLAMA_STACK_PORT": ("8321", "Port for the Llama Stack distribution server"),
+        },
+    )
diff --git a/llama_stack/templates/llama_api/run.yaml b/llama_stack/templates/llama_api/run.yaml
new file mode 100644
index 000000000..a7f2b0769
--- /dev/null
+++ b/llama_stack/templates/llama_api/run.yaml
@@ -0,0 +1,168 @@
+version: '2'
+image_name: llama_api
+apis:
+- agents
+- datasetio
+- eval
+- inference
+- safety
+- scoring
+- telemetry
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: llama-openai-compat
+    provider_type: remote::llama-openai-compat
+    config:
+      openai_compat_api_base: https://api.llama.com/compat/v1/
+      api_key: ${env.LLAMA_API_KEY:}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
+  vector_io:
+  - provider_id: sqlite-vec
+    provider_type: inline::sqlite-vec
+    config:
+      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/sqlite_vec.db
+  - provider_id: ${env.ENABLE_CHROMADB+chromadb}
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:}
+  - provider_id: ${env.ENABLE_PGVECTOR+pgvector}
+    provider_type: remote::pgvector
+    config:
+      host: ${env.PGVECTOR_HOST:localhost}
+      port: ${env.PGVECTOR_PORT:5432}
+      db: ${env.PGVECTOR_DB:}
+      user: ${env.PGVECTOR_USER:}
+      password: ${env.PGVECTOR_PASSWORD:}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/responses_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: ${env.OTEL_SERVICE_NAME:}
+      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/trace_store.db
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/meta_reference_eval.db
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/huggingface_datasetio.db
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/localfs_datasetio.db
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+    config: {}
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+    config: {}
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      openai_api_key: ${env.OPENAI_API_KEY:}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+metadata_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/llama_api}/inference_store.db
+models:
+- metadata: {}
+  model_id: Llama-3.3-70B-Instruct
+  provider_id: llama-openai-compat
+  provider_model_id: Llama-3.3-70B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: llama-openai-compat
+  provider_model_id: Llama-3.3-70B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: Llama-4-Scout-17B-16E-Instruct-FP8
+  provider_id: llama-openai-compat
+  provider_model_id: Llama-4-Scout-17B-16E-Instruct-FP8
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+  provider_id: llama-openai-compat
+  provider_model_id: Llama-4-Scout-17B-16E-Instruct-FP8
+  model_type: llm
+- metadata: {}
+  model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
+  provider_id: llama-openai-compat
+  provider_model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: llama-openai-compat
+  provider_model_id: Llama-4-Maverick-17B-128E-Instruct-FP8
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
+shields:
+- shield_id: meta-llama/Llama-Guard-3-8B
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
+  provider_id: tavily-search
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+server:
+  port: 8321
diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml
index b9130fc7d..53ad411e3 100644
--- a/llama_stack/templates/meta-reference-gpu/build.yaml
+++ b/llama_stack/templates/meta-reference-gpu/build.yaml
@@ -26,7 +26,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py
index 8ba9fadca..95d126095 100644
--- a/llama_stack/templates/meta-reference-gpu/meta_reference.py
+++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py
@@ -36,7 +36,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -86,10 +85,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
index 63177ab09..2b751a514 100644
--- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
@@ -56,13 +56,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -108,9 +111,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -120,6 +120,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -145,7 +148,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml
index 380d83060..a24c5fec5 100644
--- a/llama_stack/templates/meta-reference-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run.yaml
@@ -46,13 +46,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -98,9 +101,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -110,6 +110,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -130,7 +133,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml
index a33fa3737..6bd8a0100 100644
--- a/llama_stack/templates/nvidia/build.yaml
+++ b/llama_stack/templates/nvidia/build.yaml
@@ -18,8 +18,12 @@ distribution_spec:
     - remote::nvidia
     datasetio:
     - inline::localfs
+    - remote::nvidia
     scoring:
     - inline::basic
     tool_runtime:
     - inline::rag-runtime
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/nvidia/doc_template.md b/llama_stack/templates/nvidia/doc_template.md
index 068dd7ac3..50c96802f 100644
--- a/llama_stack/templates/nvidia/doc_template.md
+++ b/llama_stack/templates/nvidia/doc_template.md
@@ -116,7 +116,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
 ```
diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py
index 463c13879..bfd004037 100644
--- a/llama_stack/templates/nvidia/nvidia.py
+++ b/llama_stack/templates/nvidia/nvidia.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput
+from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig
 from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
 from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
@@ -23,7 +24,7 @@ def get_distribution_template() -> DistributionTemplate:
         "telemetry": ["inline::meta-reference"],
         "eval": ["remote::nvidia"],
         "post_training": ["remote::nvidia"],
-        "datasetio": ["inline::localfs"],
+        "datasetio": ["inline::localfs", "remote::nvidia"],
         "scoring": ["inline::basic"],
         "tool_runtime": ["inline::rag-runtime"],
     }
@@ -38,6 +39,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="remote::nvidia",
         config=NVIDIASafetyConfig.sample_run_config(),
     )
+    datasetio_provider = Provider(
+        provider_id="nvidia",
+        provider_type="remote::nvidia",
+        config=NvidiaDatasetIOConfig.sample_run_config(),
+    )
     eval_provider = Provider(
         provider_id="nvidia",
         provider_type="remote::nvidia",
@@ -75,6 +81,7 @@ def get_distribution_template() -> DistributionTemplate:
             "run.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
+                    "datasetio": [datasetio_provider],
                     "eval": [eval_provider],
                 },
                 default_models=default_models,
diff --git a/llama_stack/templates/nvidia/run-with-safety.yaml b/llama_stack/templates/nvidia/run-with-safety.yaml
index a3e5fefa4..c431e12f2 100644
--- a/llama_stack/templates/nvidia/run-with-safety.yaml
+++ b/llama_stack/templates/nvidia/run-with-safety.yaml
@@ -46,13 +46,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/trace_store.db
   eval:
   - provider_id: nvidia
     provider_type: remote::nvidia
@@ -74,6 +77,13 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db
+  - provider_id: nvidia
+    provider_type: remote::nvidia
+    config:
+      api_key: ${env.NVIDIA_API_KEY:}
+      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:default}
+      project_id: ${env.NVIDIA_PROJECT_ID:test-project}
+      datasets_url: ${env.NVIDIA_DATASETS_URL:http://nemo.test}
   scoring:
   - provider_id: basic
     provider_type: inline::basic
@@ -85,6 +95,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml
index 271ce1a16..5b244081d 100644
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/trace_store.db
   eval:
   - provider_id: nvidia
     provider_type: remote::nvidia
@@ -62,13 +65,13 @@ providers:
       project_id: ${env.NVIDIA_PROJECT_ID:test-project}
       customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:http://nemo.test}
   datasetio:
-  - provider_id: localfs
-    provider_type: inline::localfs
+  - provider_id: nvidia
+    provider_type: remote::nvidia
     config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db
+      api_key: ${env.NVIDIA_API_KEY:}
+      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:default}
+      project_id: ${env.NVIDIA_PROJECT_ID:test-project}
+      datasets_url: ${env.NVIDIA_DATASETS_URL:http://nemo.test}
   scoring:
   - provider_id: basic
     provider_type: inline::basic
@@ -80,6 +83,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/inference_store.db
 models:
 - metadata: {}
   model_id: meta/llama3-8b-instruct
diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml
index 37b72fc1f..36a120897 100644
--- a/llama_stack/templates/ollama/build.yaml
+++ b/llama_stack/templates/ollama/build.yaml
@@ -23,11 +23,15 @@ distribution_spec:
     - inline::basic
     - inline::llm-as-judge
     - inline::braintrust
+    post_training:
+    - inline::huggingface
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
     - remote::wolfram-alpha
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md
index f961ab7ed..aaa65bab2 100644
--- a/llama_stack/templates/ollama/doc_template.md
+++ b/llama_stack/templates/ollama/doc_template.md
@@ -86,7 +86,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env SAFETY_MODEL=$SAFETY_MODEL \
diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py
index d9f0960a2..0b4f05128 100644
--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@@ -13,6 +13,7 @@ from llama_stack.distribution.datatypes import (
     ShieldInput,
     ToolGroupInput,
 )
+from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -28,10 +29,10 @@ def get_distribution_template() -> DistributionTemplate:
         "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
         "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "post_training": ["inline::huggingface"],
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
             "remote::wolfram-alpha",
@@ -48,7 +49,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="inline::faiss",
         config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
     )
-
+    posttraining_provider = Provider(
+        provider_id="huggingface",
+        provider_type="inline::huggingface",
+        config=HuggingFacePostTrainingConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
         provider_id="ollama",
@@ -75,10 +80,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
         ToolGroupInput(
             toolgroup_id="builtin::wolfram_alpha",
             provider_id="wolfram-alpha",
@@ -97,6 +98,7 @@ def get_distribution_template() -> DistributionTemplate:
                 provider_overrides={
                     "inference": [inference_provider],
                     "vector_io": [vector_io_provider_faiss],
+                    "post_training": [posttraining_provider],
                 },
                 default_models=[inference_model, embedding_model],
                 default_tool_groups=default_tool_groups,
@@ -105,6 +107,7 @@ def get_distribution_template() -> DistributionTemplate:
                 provider_overrides={
                     "inference": [inference_provider],
                     "vector_io": [vector_io_provider_faiss],
+                    "post_training": [posttraining_provider],
                     "safety": [
                         Provider(
                             provider_id="llama-guard",
diff --git a/llama_stack/templates/ollama/report.md b/llama_stack/templates/ollama/report.md
deleted file mode 100644
index 724809a59..000000000
--- a/llama_stack/templates/ollama/report.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Report for ollama distribution
-
-## Supported Models
-| Model Descriptor | ollama |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ❌ |
-| Llama-Guard-3-1B | ✅ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
-| /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml
index b43fec6db..d63c5e366 100644
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@@ -5,6 +5,7 @@ apis:
 - datasetio
 - eval
 - inference
+- post_training
 - safety
 - scoring
 - telemetry
@@ -39,13 +40,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -80,6 +84,13 @@ providers:
     provider_type: inline::braintrust
     config:
       openai_api_key: ${env.OPENAI_API_KEY:}
+  post_training:
+  - provider_id: huggingface
+    provider_type: inline::huggingface
+    config:
+      checkpoint_format: huggingface
+      distributed_backend: null
+      device: cpu
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -91,9 +102,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -107,6 +115,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -136,8 +147,6 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 - toolgroup_id: builtin::wolfram_alpha
   provider_id: wolfram-alpha
 server:
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index c8f4ad9ad..d208cd7f0 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -5,6 +5,7 @@ apis:
 - datasetio
 - eval
 - inference
+- post_training
 - safety
 - scoring
 - telemetry
@@ -37,13 +38,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -78,6 +82,13 @@ providers:
     provider_type: inline::braintrust
     config:
       openai_api_key: ${env.OPENAI_API_KEY:}
+  post_training:
+  - provider_id: huggingface
+    provider_type: inline::huggingface
+    config:
+      checkpoint_format: huggingface
+      distributed_backend: null
+      device: cpu
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -89,9 +100,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -105,6 +113,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -126,8 +137,6 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 - toolgroup_id: builtin::wolfram_alpha
   provider_id: wolfram-alpha
 server:
diff --git a/llama_stack/templates/open-benchmark/build.yaml b/llama_stack/templates/open-benchmark/build.yaml
index 1db90ef27..840f1e1db 100644
--- a/llama_stack/templates/open-benchmark/build.yaml
+++ b/llama_stack/templates/open-benchmark/build.yaml
@@ -30,7 +30,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py
index a6a906c6f..d944d4eff 100644
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Dict, List, Tuple
 
 from llama_stack.apis.datasets import DatasetPurpose, URIDataSource
 from llama_stack.apis.models.models import ModelType
@@ -36,7 +35,7 @@ from llama_stack.templates.template import (
 )
 
 
-def get_inference_providers() -> Tuple[List[Provider], Dict[str, List[ProviderModelEntry]]]:
+def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:
     # in this template, we allow each API key to be optional
     providers = [
         (
@@ -108,7 +107,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -146,10 +144,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     default_models = get_model_registry(available_models) + [
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 5e908b081..0e5edf728 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -64,13 +64,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open-benchmark/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -116,9 +119,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -128,6 +128,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/inference_store.db
 models:
 - metadata: {}
   model_id: openai/gpt-4o
@@ -242,7 +245,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/passthrough/build.yaml b/llama_stack/templates/passthrough/build.yaml
index fb1fb1066..46b99cb75 100644
--- a/llama_stack/templates/passthrough/build.yaml
+++ b/llama_stack/templates/passthrough/build.yaml
@@ -28,7 +28,9 @@ distribution_spec:
     - remote::brave-search
     - remote::tavily-search
     - remote::wolfram-alpha
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/passthrough/passthrough.py b/llama_stack/templates/passthrough/passthrough.py
index 8454e49cf..6a30625c5 100644
--- a/llama_stack/templates/passthrough/passthrough.py
+++ b/llama_stack/templates/passthrough/passthrough.py
@@ -38,7 +38,6 @@ def get_distribution_template() -> DistributionTemplate:
             "remote::brave-search",
             "remote::tavily-search",
             "remote::wolfram-alpha",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -100,10 +99,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/passthrough/run-with-safety.yaml b/llama_stack/templates/passthrough/run-with-safety.yaml
index 8ab6b1081..bbf5d9a52 100644
--- a/llama_stack/templates/passthrough/run-with-safety.yaml
+++ b/llama_stack/templates/passthrough/run-with-safety.yaml
@@ -46,13 +46,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -102,9 +105,6 @@ providers:
     provider_type: remote::wolfram-alpha
     config:
       api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -114,6 +114,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/inference_store.db
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
@@ -148,7 +151,5 @@ tool_groups:
   provider_id: wolfram-alpha
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/passthrough/run.yaml b/llama_stack/templates/passthrough/run.yaml
index 53e8c8857..146906d9b 100644
--- a/llama_stack/templates/passthrough/run.yaml
+++ b/llama_stack/templates/passthrough/run.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -97,9 +100,6 @@ providers:
     provider_type: remote::wolfram-alpha
     config:
       api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -109,6 +109,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/inference_store.db
 models:
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
@@ -138,7 +141,5 @@ tool_groups:
   provider_id: wolfram-alpha
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/postgres-demo/__init__.py b/llama_stack/templates/postgres-demo/__init__.py
new file mode 100644
index 000000000..81473cb73
--- /dev/null
+++ b/llama_stack/templates/postgres-demo/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .postgres_demo import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/postgres-demo/build.yaml b/llama_stack/templates/postgres-demo/build.yaml
new file mode 100644
index 000000000..8f3648abe
--- /dev/null
+++ b/llama_stack/templates/postgres-demo/build.yaml
@@ -0,0 +1,24 @@
+version: '2'
+distribution_spec:
+  description: Quick start template for running Llama Stack with several popular providers
+  providers:
+    inference:
+    - remote::fireworks
+    - remote::vllm
+    vector_io:
+    - remote::chromadb
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+    tool_runtime:
+    - remote::brave-search
+    - remote::tavily-search
+    - inline::rag-runtime
+    - remote::model-context-protocol
+image_type: conda
+additional_pip_packages:
+- asyncpg
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/postgres-demo/postgres_demo.py b/llama_stack/templates/postgres-demo/postgres_demo.py
new file mode 100644
index 000000000..d2e352320
--- /dev/null
+++ b/llama_stack/templates/postgres-demo/postgres_demo.py
@@ -0,0 +1,164 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from llama_stack.distribution.datatypes import (
+    ModelInput,
+    Provider,
+    ShieldInput,
+    ToolGroupInput,
+)
+from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig
+from llama_stack.providers.remote.inference.fireworks.models import (
+    MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES,
+)
+from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
+from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
+from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
+from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
+from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)
+
+
+def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:
+    # Returns (providers, model entries keyed by provider id); in this template each API key is optional.
+    providers = [  # one (provider_id, model entries, provider config) tuple per remote provider
+        (
+            "fireworks",
+            FIREWORKS_MODEL_ENTRIES,
+            FireworksImplConfig.sample_run_config(api_key="${env.FIREWORKS_API_KEY:}"),
+        ),
+    ]
+    inference_providers = []  # Provider objects, in the order they are appended below
+    available_models = {}  # provider_id -> model entries for that provider
+    for provider_id, model_entries, config in providers:
+        inference_providers.append(
+            Provider(
+                provider_id=provider_id,
+                provider_type=f"remote::{provider_id}",  # the id doubles as the remote provider type suffix
+                config=config,
+            )
+        )
+        available_models[provider_id] = model_entries
+    inference_providers.append(  # vLLM is appended separately and gets no entry in available_models
+        Provider(
+            provider_id="vllm-inference",
+            provider_type="remote::vllm",
+            config=VLLMInferenceAdapterConfig.sample_run_config(
+                url="${env.VLLM_URL:http://localhost:8000/v1}",
+            ),
+        )
+    )
+    return inference_providers, available_models
+
+
+def get_distribution_template() -> DistributionTemplate:  # builds the "postgres-demo" template
+    inference_providers, available_models = get_inference_providers()
+    providers = {  # API name -> provider types, handed to DistributionTemplate(providers=...)
+        "inference": ([p.provider_type for p in inference_providers]),
+        "vector_io": ["remote::chromadb"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+        "tool_runtime": [
+            "remote::brave-search",
+            "remote::tavily-search",
+            "inline::rag-runtime",
+            "remote::model-context-protocol",
+        ],
+    }
+    name = "postgres-demo"
+
+    vector_io_providers = [
+        Provider(  # NOTE(review): the id presumably resolves only when ENABLE_CHROMADB is set — confirm
+            provider_id="${env.ENABLE_CHROMADB+chromadb}",
+            provider_type="remote::chromadb",
+            config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"),
+        ),
+    ]
+    default_tool_groups = [  # passed as RunConfigSettings(default_tool_groups=...) below
+        ToolGroupInput(
+            toolgroup_id="builtin::websearch",
+            provider_id="tavily-search",
+        ),
+        ToolGroupInput(
+            toolgroup_id="builtin::rag",
+            provider_id="rag-runtime",
+        ),
+    ]
+
+    default_models = get_model_registry(available_models)
+    default_models.append(  # on top of the registry models: one model bound to the "vllm-inference" provider
+        ModelInput(
+            model_id="${env.INFERENCE_MODEL}",
+            provider_id="vllm-inference",
+        )
+    )
+    postgres_config = {  # one connection spec reused for every store configured below
+        "type": "postgres",
+        "host": "${env.POSTGRES_HOST:localhost}",
+        "port": "${env.POSTGRES_PORT:5432}",
+        "db": "${env.POSTGRES_DB:llamastack}",
+        "user": "${env.POSTGRES_USER:llamastack}",
+        "password": "${env.POSTGRES_PASSWORD:llamastack}",
+    }
+
+    return DistributionTemplate(
+        name=name,
+        distro_type="self_hosted",
+        description="Quick start template for running Llama Stack with several popular providers",
+        container_image=None,
+        template_path=None,
+        providers=providers,
+        available_models_by_provider=available_models,
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": inference_providers,
+                    "vector_io": vector_io_providers,
+                    "agents": [
+                        Provider(
+                            provider_id="meta-reference",
+                            provider_type="inline::meta-reference",
+                            config=dict(  # both agent stores use the same Postgres connection spec
+                                persistence_store=postgres_config,
+                                responses_store=postgres_config,
+                            ),
+                        )
+                    ],
+                    "telemetry": [
+                        Provider(
+                            provider_id="meta-reference",
+                            provider_type="inline::meta-reference",
+                            config=dict(
+                                service_name="${env.OTEL_SERVICE_NAME:}",
+                                sinks="${env.TELEMETRY_SINKS:console}",
+                            ),
+                        )
+                    ],
+                },
+                default_models=default_models,
+                default_tool_groups=default_tool_groups,
+                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
+                metadata_store=PostgresKVStoreConfig.model_validate(postgres_config),
+                inference_store=PostgresSqlStoreConfig.model_validate(postgres_config),
+            ),
+        },
+        run_config_env_vars={  # env var name -> (value, description); value is presumably the default — confirm
+            "LLAMA_STACK_PORT": (
+                "8321",
+                "Port for the Llama Stack distribution server",
+            ),
+            "FIREWORKS_API_KEY": (
+                "",
+                "Fireworks API Key",
+            ),
+        },
+    )
diff --git a/llama_stack/templates/postgres-demo/run.yaml b/llama_stack/templates/postgres-demo/run.yaml
new file mode 100644
index 000000000..889b8eaa7
--- /dev/null
+++ b/llama_stack/templates/postgres-demo/run.yaml
@@ -0,0 +1,224 @@
+version: '2'
+image_name: postgres-demo
+apis:
+- agents
+- inference
+- safety
+- telemetry
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: fireworks
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference/v1
+      api_key: ${env.FIREWORKS_API_KEY:}
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:http://localhost:8000/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:true}
+  vector_io:
+  - provider_id: ${env.ENABLE_CHROMADB+chromadb}
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:localhost}
+        port: ${env.POSTGRES_PORT:5432}
+        db: ${env.POSTGRES_DB:llamastack}
+        user: ${env.POSTGRES_USER:llamastack}
+        password: ${env.POSTGRES_PASSWORD:llamastack}
+      responses_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:localhost}
+        port: ${env.POSTGRES_PORT:5432}
+        db: ${env.POSTGRES_DB:llamastack}
+        user: ${env.POSTGRES_USER:llamastack}
+        password: ${env.POSTGRES_PASSWORD:llamastack}
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: ${env.OTEL_SERVICE_NAME:}
+      sinks: ${env.TELEMETRY_SINKS:console}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+metadata_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:localhost}
+  port: ${env.POSTGRES_PORT:5432}
+  db: ${env.POSTGRES_DB:llamastack}
+  user: ${env.POSTGRES_USER:llamastack}
+  password: ${env.POSTGRES_PASSWORD:llamastack}
+  table_name: llamastack_kvstore
+inference_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:localhost}
+  port: ${env.POSTGRES_PORT:5432}
+  db: ${env.POSTGRES_DB:llamastack}
+  user: ${env.POSTGRES_USER:llamastack}
+  password: ${env.POSTGRES_PASSWORD:llamastack}
+models:
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-8B-Instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-70B-Instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-3B-Instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-guard-3-8b
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-guard-3-8b
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-8B
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-guard-3-8b
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama-guard-3-11b-vision
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-11B-Vision
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama4-scout-instruct-basic
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic
+  model_type: llm
+- metadata: {}
+  model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: fireworks
+  provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic
+  model_type: llm
+- metadata:
+    embedding_dimension: 768
+    context_length: 8192
+  model_id: nomic-ai/nomic-embed-text-v1.5
+  provider_id: fireworks
+  provider_model_id: nomic-ai/nomic-embed-text-v1.5
+  model_type: embedding
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  model_type: llm
+shields:
+- shield_id: meta-llama/Llama-Guard-3-8B
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
+  provider_id: tavily-search
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+server:
+  port: 8321
diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml
index b2bbf853a..16fe5d4fd 100644
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@@ -27,8 +27,10 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
     - remote::wolfram-alpha
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md
index 3cede6080..5684888da 100644
--- a/llama_stack/templates/remote-vllm/doc_template.md
+++ b/llama_stack/templates/remote-vllm/doc_template.md
@@ -220,7 +220,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1
@@ -242,7 +242,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \
diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml
index bb69496aa..e83162a4f 100644
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@@ -50,6 +50,9 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/responses_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -88,9 +91,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/trace_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -102,9 +105,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -118,6 +118,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -143,8 +146,6 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 - toolgroup_id: builtin::wolfram_alpha
   provider_id: wolfram-alpha
 server:
diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml
index 14f2da37e..4cdf88c6b 100644
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@@ -43,6 +43,9 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/responses_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -81,9 +84,9 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/trace_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -95,9 +98,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -111,6 +111,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -131,8 +134,6 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 - toolgroup_id: builtin::wolfram_alpha
   provider_id: wolfram-alpha
 server:
diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py
index 0f6c7659e..2782a3ea0 100644
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@@ -34,7 +34,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
             "remote::wolfram-alpha",
@@ -84,10 +83,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
         ToolGroupInput(
             toolgroup_id="builtin::wolfram_alpha",
             provider_id="wolfram-alpha",
diff --git a/llama_stack/templates/sambanova/build.yaml b/llama_stack/templates/sambanova/build.yaml
index ca5ffe618..14b1c8974 100644
--- a/llama_stack/templates/sambanova/build.yaml
+++ b/llama_stack/templates/sambanova/build.yaml
@@ -1,15 +1,16 @@
 version: '2'
 distribution_spec:
-  description: Use SambaNova.AI for running LLM inference
+  description: Use SambaNova for running LLM inference and safety
   providers:
     inference:
     - remote::sambanova
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb
     - remote::pgvector
     safety:
-    - inline::llama-guard
+    - remote::sambanova
     agents:
     - inline::meta-reference
     telemetry:
@@ -17,6 +18,10 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
+    - remote::model-context-protocol
+    - remote::wolfram-alpha
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/sambanova/doc_template.md b/llama_stack/templates/sambanova/doc_template.md
index 42d9efb66..1dc76fd3f 100644
--- a/llama_stack/templates/sambanova/doc_template.md
+++ b/llama_stack/templates/sambanova/doc_template.md
@@ -37,33 +37,44 @@ The following models are available by default:
 
 ### Prerequisite: API Keys
 
-Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://sambanova.ai/).
+Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://cloud.sambanova.ai/?utm_source=llamastack&utm_medium=external&utm_campaign=cloud_signup).
 
 
 ## Running Llama Stack with SambaNova
 
 You can do this via Conda (build code) or Docker which has a pre-built image.
 
-### Via Docker
 
-This method allows you to get started quickly without having to build the distribution code.
+### Via Docker
 
 ```bash
 LLAMA_STACK_PORT=8321
+llama stack build --template sambanova --image-type container
 docker run \
   -it \
-  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  llamastack/distribution-{{ name }} \
+  -v ~/.llama:/root/.llama \
+  distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
   --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
 ```
 
+
+### Via Venv
+
+```bash
+llama stack build --template sambanova --image-type venv
+llama stack run --image-type venv ~/.llama/distributions/sambanova/sambanova-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
+```
+
+
 ### Via Conda
 
 ```bash
 llama stack build --template sambanova --image-type conda
-llama stack run ./run.yaml \
+llama stack run --image-type conda ~/.llama/distributions/sambanova/sambanova-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
 ```
diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml
index e4e8e4e21..8c2a933ab 100644
--- a/llama_stack/templates/sambanova/run.yaml
+++ b/llama_stack/templates/sambanova/run.yaml
@@ -14,6 +14,9 @@ providers:
     config:
       url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -35,10 +38,11 @@ providers:
       user: ${env.PGVECTOR_USER:}
       password: ${env.PGVECTOR_PASSWORD:}
   safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
+  - provider_id: sambanova
+    provider_type: remote::sambanova
     config:
-      excluded_categories: []
+      url: https://api.sambanova.ai/v1
+      api_key: ${env.SAMBANOVA_API_KEY}
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -47,13 +51,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/sambanova/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/trace_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -65,118 +72,133 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+  - provider_id: wolfram-alpha
+    provider_type: remote::wolfram-alpha
+    config:
+      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/inference_store.db
 models:
 - metadata: {}
-  model_id: Meta-Llama-3.1-8B-Instruct
+  model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-8B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-8B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.1-70B-Instruct
+  model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-70B-Instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-70B-Instruct
-  model_type: llm
-- metadata: {}
-  model_id: Meta-Llama-3.1-405B-Instruct
-  provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-405B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-405B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.2-1B-Instruct
+  model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.2-1B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-1B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.2-1B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.2-3B-Instruct
+  model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.2-3B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.2-3B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.3-70B-Instruct
+  model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.3-70B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.3-70B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.3-70B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Llama-3.2-11B-Vision-Instruct
+  model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-3.2-11B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-3.2-11B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Llama-3.2-90B-Vision-Instruct
+  model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-3.2-90B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-3.2-90B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-Guard-3-8B
+  model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-Guard-3-8B
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: sambanova
-  provider_model_id: Meta-Llama-Guard-3-8B
-  model_type: llm
-- metadata: {}
-  model_id: Llama-4-Scout-17B-16E-Instruct
-  provider_id: sambanova
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-Guard-3-8B
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-8B
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
 shields:
 - shield_id: meta-llama/Llama-Guard-3-8B
+  provider_shield_id: sambanova/Meta-Llama-Guard-3-8B
+- shield_id: sambanova/Meta-Llama-Guard-3-8B
+  provider_shield_id: sambanova/Meta-Llama-Guard-3-8B
 vector_dbs: []
 datasets: []
 scoring_fns: []
@@ -186,7 +208,7 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
+- toolgroup_id: builtin::wolfram_alpha
+  provider_id: wolfram-alpha
 server:
   port: 8321
diff --git a/llama_stack/templates/sambanova/sambanova.py b/llama_stack/templates/sambanova/sambanova.py
index 8b91f8712..54a49423d 100644
--- a/llama_stack/templates/sambanova/sambanova.py
+++ b/llama_stack/templates/sambanova/sambanova.py
@@ -6,7 +6,16 @@
 
 from pathlib import Path
 
-from llama_stack.distribution.datatypes import Provider, ShieldInput, ToolGroupInput
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import (
+    ModelInput,
+    Provider,
+    ShieldInput,
+    ToolGroupInput,
+)
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.sambanova import SambaNovaImplConfig
 from llama_stack.providers.remote.inference.sambanova.models import MODEL_ENTRIES
@@ -23,26 +32,38 @@ from llama_stack.templates.template import (
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::sambanova"],
+        "inference": ["remote::sambanova", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-        "safety": ["inline::llama-guard"],
+        "safety": ["remote::sambanova"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
+            "remote::model-context-protocol",
+            "remote::wolfram-alpha",
         ],
     }
     name = "sambanova"
-
     inference_provider = Provider(
         provider_id=name,
         provider_type=f"remote::{name}",
         config=SambaNovaImplConfig.sample_run_config(),
     )
-
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     vector_io_providers = [
         Provider(
             provider_id="faiss",
@@ -81,27 +102,35 @@ def get_distribution_template() -> DistributionTemplate:
             provider_id="rag-runtime",
         ),
         ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
+            toolgroup_id="builtin::wolfram_alpha",
+            provider_id="wolfram-alpha",
         ),
     ]
 
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
-        description="Use SambaNova.AI for running LLM inference",
-        docker_image=None,
+        description="Use SambaNova for running LLM inference and safety",
+        container_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
         available_models_by_provider=available_models,
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "vector_io": vector_io_providers,
                 },
-                default_models=default_models,
-                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
+                default_models=default_models + [embedding_model],
+                default_shields=[
+                    ShieldInput(
+                        shield_id="meta-llama/Llama-Guard-3-8B", provider_shield_id="sambanova/Meta-Llama-Guard-3-8B"
+                    ),
+                    ShieldInput(
+                        shield_id="sambanova/Meta-Llama-Guard-3-8B",
+                        provider_shield_id="sambanova/Meta-Llama-Guard-3-8B",
+                    ),
+                ],
                 default_tool_groups=default_tool_groups,
             ),
         },
@@ -112,7 +141,7 @@ def get_distribution_template() -> DistributionTemplate:
             ),
             "SAMBANOVA_API_KEY": (
                 "",
-                "SambaNova.AI API Key",
+                "SambaNova API Key",
             ),
         },
     )
diff --git a/llama_stack/templates/starter/__init__.py b/llama_stack/templates/starter/__init__.py
new file mode 100644
index 000000000..9c0d937ce
--- /dev/null
+++ b/llama_stack/templates/starter/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .starter import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/starter/build.yaml b/llama_stack/templates/starter/build.yaml
new file mode 100644
index 000000000..ec97c7d3e
--- /dev/null
+++ b/llama_stack/templates/starter/build.yaml
@@ -0,0 +1,40 @@
+version: '2'
+distribution_spec:
+  description: Quick start template for running Llama Stack with several popular providers
+  providers:
+    inference:
+    - remote::openai
+    - remote::fireworks
+    - remote::anthropic
+    - remote::gemini
+    - remote::groq
+    - remote::sambanova
+    - inline::sentence-transformers
+    vector_io:
+    - inline::sqlite-vec
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+    eval:
+    - inline::meta-reference
+    datasetio:
+    - remote::huggingface
+    - inline::localfs
+    scoring:
+    - inline::basic
+    - inline::llm-as-judge
+    - inline::braintrust
+    tool_runtime:
+    - remote::brave-search
+    - remote::tavily-search
+    - inline::rag-runtime
+    - remote::model-context-protocol
+image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/starter/run.yaml
similarity index 66%
rename from llama_stack/templates/dev/run.yaml
rename to llama_stack/templates/starter/run.yaml
index 0dd056405..04425ed35 100644
--- a/llama_stack/templates/dev/run.yaml
+++ b/llama_stack/templates/starter/run.yaml
@@ -1,5 +1,5 @@
 version: '2'
-image_name: dev
+image_name: starter
 apis:
 - agents
 - datasetio
@@ -34,6 +34,11 @@ providers:
     config:
       url: https://api.groq.com
       api_key: ${env.GROQ_API_KEY:}
+  - provider_id: sambanova
+    provider_type: remote::sambanova
+    config:
+      url: https://api.sambanova.ai/v1
+      api_key: ${env.SAMBANOVA_API_KEY:}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
@@ -41,7 +46,7 @@ providers:
   - provider_id: sqlite-vec
     provider_type: inline::sqlite-vec
     config:
-      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/sqlite_vec.db
+      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/sqlite_vec.db
   - provider_id: ${env.ENABLE_CHROMADB+chromadb}
     provider_type: remote::chromadb
     config:
@@ -66,14 +71,17 @@ providers:
       persistence_store:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/agents_store.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dev/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -81,7 +89,7 @@ providers:
       kvstore:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/meta_reference_eval.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -89,14 +97,14 @@ providers:
       kvstore:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/huggingface_datasetio.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/huggingface_datasetio.db
   - provider_id: localfs
     provider_type: inline::localfs
     config:
       kvstore:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/localfs_datasetio.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/localfs_datasetio.db
   scoring:
   - provider_id: basic
     provider_type: inline::basic
@@ -119,9 +127,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -130,7 +135,10 @@ providers:
     config: {}
 metadata_store:
   type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/registry.db
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/inference_store.db
 models:
 - metadata: {}
   model_id: openai/gpt-4o
@@ -147,6 +155,76 @@ models:
   provider_id: openai
   provider_model_id: openai/chatgpt-4o-latest
   model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo-0125
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo-0125
+  model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo
+  model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo-instruct
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo-instruct
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4
+  provider_id: openai
+  provider_model_id: gpt-4
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4-turbo
+  provider_id: openai
+  provider_model_id: gpt-4-turbo
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o
+  provider_id: openai
+  provider_model_id: gpt-4o
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-2024-08-06
+  provider_id: openai
+  provider_model_id: gpt-4o-2024-08-06
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-mini
+  provider_id: openai
+  provider_model_id: gpt-4o-mini
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-audio-preview
+  provider_id: openai
+  provider_model_id: gpt-4o-audio-preview
+  model_type: llm
+- metadata: {}
+  model_id: chatgpt-4o-latest
+  provider_id: openai
+  provider_model_id: chatgpt-4o-latest
+  model_type: llm
+- metadata: {}
+  model_id: o1
+  provider_id: openai
+  provider_model_id: o1
+  model_type: llm
+- metadata: {}
+  model_id: o1-mini
+  provider_id: openai
+  provider_model_id: o1-mini
+  model_type: llm
+- metadata: {}
+  model_id: o3-mini
+  provider_id: openai
+  provider_model_id: o3-mini
+  model_type: llm
+- metadata: {}
+  model_id: o4-mini
+  provider_id: openai
+  provider_model_id: o4-mini
+  model_type: llm
 - metadata:
     embedding_dimension: 1536
     context_length: 8192
@@ -161,6 +239,20 @@ models:
   provider_id: openai
   provider_model_id: openai/text-embedding-3-large
   model_type: embedding
+- metadata:
+    embedding_dimension: 1536
+    context_length: 8192
+  model_id: text-embedding-3-small
+  provider_id: openai
+  provider_model_id: text-embedding-3-small
+  model_type: embedding
+- metadata:
+    embedding_dimension: 3072
+    context_length: 8192
+  model_id: text-embedding-3-large
+  provider_id: openai
+  provider_model_id: text-embedding-3-large
+  model_type: embedding
 - metadata: {}
   model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
   provider_id: fireworks
@@ -416,6 +508,106 @@ models:
   provider_id: groq
   provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
   model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.1-8B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-8B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.1-405B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.2-1B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-1B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.2-3B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-3B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.3-70B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-3.2-11B-Vision-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-3.2-90B-Vision-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-Guard-3-8B
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-8B
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
@@ -432,7 +624,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/dev/dev.py b/llama_stack/templates/starter/starter.py
similarity index 91%
rename from llama_stack/templates/dev/dev.py
rename to llama_stack/templates/starter/starter.py
index 69924acbe..0932bfdfe 100644
--- a/llama_stack/templates/dev/dev.py
+++ b/llama_stack/templates/starter/starter.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import List, Tuple
 
 from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import (
@@ -39,10 +38,15 @@ from llama_stack.providers.remote.inference.openai.config import OpenAIConfig
 from llama_stack.providers.remote.inference.openai.models import (
     MODEL_ENTRIES as OPENAI_MODEL_ENTRIES,
 )
+from llama_stack.providers.remote.inference.sambanova.config import SambaNovaImplConfig
+from llama_stack.providers.remote.inference.sambanova.models import (
+    MODEL_ENTRIES as SAMBANOVA_MODEL_ENTRIES,
+)
 from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
 from llama_stack.providers.remote.vector_io.pgvector.config import (
     PGVectorVectorIOConfig,
 )
+from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
 from llama_stack.templates.template import (
     DistributionTemplate,
     RunConfigSettings,
@@ -50,7 +54,7 @@ from llama_stack.templates.template import (
 )
 
 
-def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]:
+def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:
     # in this template, we allow each API key to be optional
     providers = [
         (
@@ -78,6 +82,11 @@ def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]:
             GROQ_MODEL_ENTRIES,
             GroqConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"),
         ),
+        (
+            "sambanova",
+            SAMBANOVA_MODEL_ENTRIES,
+            SambaNovaImplConfig.sample_run_config(api_key="${env.SAMBANOVA_API_KEY:}"),
+        ),
     ]
     inference_providers = []
     available_models = {}
@@ -107,12 +116,11 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
     }
-    name = "dev"
+    name = "starter"
 
     vector_io_providers = [
         Provider(
@@ -150,10 +158,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
     embedding_model = ModelInput(
         model_id="all-MiniLM-L6-v2",
@@ -168,7 +172,7 @@ def get_distribution_template() -> DistributionTemplate:
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
-        description="Distribution for running e2e tests in CI",
+        description="Quick start template for running Llama Stack with several popular providers",
         container_image=None,
         template_path=None,
         providers=providers,
diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py
index 92b1b534d..4013f08f9 100644
--- a/llama_stack/templates/template.py
+++ b/llama_stack/templates/template.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Tuple
+from typing import Literal
 
 import jinja2
 import yaml
@@ -28,12 +28,13 @@ from llama_stack.distribution.datatypes import (
 from llama_stack.distribution.distribution import get_provider_registry
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
-from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
+from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
+from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig
 
 
 def get_model_registry(
-    available_models: Dict[str, List[ProviderModelEntry]],
-) -> List[ModelInput]:
+    available_models: dict[str, list[ProviderModelEntry]],
+) -> list[ModelInput]:
     models = []
     for provider_id, entries in available_models.items():
         for entry in entries:
@@ -57,18 +58,20 @@ class DefaultModel(BaseModel):
 
 
 class RunConfigSettings(BaseModel):
-    provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
-    default_models: Optional[List[ModelInput]] = None
-    default_shields: Optional[List[ShieldInput]] = None
-    default_tool_groups: Optional[List[ToolGroupInput]] = None
-    default_datasets: Optional[List[DatasetInput]] = None
-    default_benchmarks: Optional[List[BenchmarkInput]] = None
+    provider_overrides: dict[str, list[Provider]] = Field(default_factory=dict)
+    default_models: list[ModelInput] | None = None
+    default_shields: list[ShieldInput] | None = None
+    default_tool_groups: list[ToolGroupInput] | None = None
+    default_datasets: list[DatasetInput] | None = None
+    default_benchmarks: list[BenchmarkInput] | None = None
+    metadata_store: KVStoreConfig | None = None
+    inference_store: SqlStoreConfig | None = None
 
     def run_config(
         self,
         name: str,
-        providers: Dict[str, List[str]],
-        container_image: Optional[str] = None,
+        providers: dict[str, list[str]],
+        container_image: str | None = None,
     ) -> StackRunConfig:
         provider_registry = get_provider_registry()
 
@@ -113,10 +116,16 @@ class RunConfigSettings(BaseModel):
             container_image=container_image,
             apis=apis,
             providers=provider_configs,
-            metadata_store=SqliteKVStoreConfig.sample_run_config(
+            metadata_store=self.metadata_store
+            or SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=f"~/.llama/distributions/{name}",
                 db_name="registry.db",
             ),
+            inference_store=self.inference_store
+            or SqliteSqlStoreConfig.sample_run_config(
+                __distro_dir__=f"~/.llama/distributions/{name}",
+                db_name="inference_store.db",
+            ),
             models=self.default_models or [],
             shields=self.default_shields or [],
             tool_groups=self.default_tool_groups or [],
@@ -135,25 +144,31 @@ class DistributionTemplate(BaseModel):
     description: str
     distro_type: Literal["self_hosted", "remote_hosted", "ondevice"]
 
-    providers: Dict[str, List[str]]
-    run_configs: Dict[str, RunConfigSettings]
-    template_path: Optional[Path] = None
+    providers: dict[str, list[str]]
+    run_configs: dict[str, RunConfigSettings]
+    template_path: Path | None = None
 
     # Optional configuration
-    run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
-    container_image: Optional[str] = None
+    run_config_env_vars: dict[str, tuple[str, str]] | None = None
+    container_image: str | None = None
 
-    available_models_by_provider: Optional[Dict[str, List[ProviderModelEntry]]] = None
+    available_models_by_provider: dict[str, list[ProviderModelEntry]] | None = None
 
     def build_config(self) -> BuildConfig:
+        additional_pip_packages: list[str] = []
+        for run_config in self.run_configs.values():
+            run_config_ = run_config.run_config(self.name, self.providers, self.container_image)
+            if run_config_.inference_store:
+                additional_pip_packages.extend(run_config_.inference_store.pip_packages)
+
         return BuildConfig(
-            name=self.name,
             distribution_spec=DistributionSpec(
                 description=self.description,
                 container_image=self.container_image,
                 providers=self.providers,
             ),
             image_type="conda",  # default to conda, can be overridden
+            additional_pip_packages=sorted(set(additional_pip_packages)),
         )
 
     def generate_markdown_docs(self) -> str:
diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml
index 9fe79647c..361b0b680 100644
--- a/llama_stack/templates/tgi/build.yaml
+++ b/llama_stack/templates/tgi/build.yaml
@@ -27,7 +27,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/tgi/doc_template.md b/llama_stack/templates/tgi/doc_template.md
index b69ccaa56..68b475893 100644
--- a/llama_stack/templates/tgi/doc_template.md
+++ b/llama_stack/templates/tgi/doc_template.md
@@ -105,7 +105,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
diff --git a/llama_stack/templates/tgi/report.md b/llama_stack/templates/tgi/report.md
deleted file mode 100644
index b0f5d88a2..000000000
--- a/llama_stack/templates/tgi/report.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Report for tgi distribution
-
-## Supported Models
-| Model Descriptor | tgi |
-|:---|:---|
-| Llama-3-8B-Instruct | ✅ |
-| Llama-3-70B-Instruct | ✅ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ✅ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ✅ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
-| /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml
index 12d6bd284..c797b93aa 100644
--- a/llama_stack/templates/tgi/run-with-safety.yaml
+++ b/llama_stack/templates/tgi/run-with-safety.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -93,9 +96,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -105,6 +105,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -125,7 +128,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml
index 9f05c7584..7e91d20bd 100644
--- a/llama_stack/templates/tgi/run.yaml
+++ b/llama_stack/templates/tgi/run.yaml
@@ -40,13 +40,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -92,9 +95,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -104,6 +104,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -124,7 +127,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py
index 22dcc3995..2c97cbf80 100644
--- a/llama_stack/templates/tgi/tgi.py
+++ b/llama_stack/templates/tgi/tgi.py
@@ -34,7 +34,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -83,10 +82,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml
index 834a3ecaf..5ffeac873 100644
--- a/llama_stack/templates/together/build.yaml
+++ b/llama_stack/templates/together/build.yaml
@@ -27,8 +27,10 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
     - remote::wolfram-alpha
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/together/report.md b/llama_stack/templates/together/report.md
deleted file mode 100644
index e125d5665..000000000
--- a/llama_stack/templates/together/report.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# Report for together distribution
-
-## Supported Models
-| Model Descriptor | together |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ❌ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ❌ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::faiss | /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
-| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
-| inline::meta-reference | /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml
index 105ce896d..190a0400b 100644
--- a/llama_stack/templates/together/run-with-safety.yaml
+++ b/llama_stack/templates/together/run-with-safety.yaml
@@ -46,13 +46,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -98,9 +101,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -114,6 +114,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/inference_store.db
 models:
 - metadata: {}
   model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
@@ -270,8 +273,6 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 - toolgroup_id: builtin::wolfram_alpha
   provider_id: wolfram-alpha
 server:
diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml
index 1f1613655..ce9542130 100644
--- a/llama_stack/templates/together/run.yaml
+++ b/llama_stack/templates/together/run.yaml
@@ -41,13 +41,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -93,9 +96,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -109,6 +109,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/inference_store.db
 models:
 - metadata: {}
   model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
@@ -260,8 +263,6 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 - toolgroup_id: builtin::wolfram_alpha
   provider_id: wolfram-alpha
 server:
diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py
index a2bd87c97..7761bd9fd 100644
--- a/llama_stack/templates/together/together.py
+++ b/llama_stack/templates/together/together.py
@@ -39,7 +39,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
             "remote::wolfram-alpha",
@@ -74,10 +73,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
         ToolGroupInput(
             toolgroup_id="builtin::wolfram_alpha",
             provider_id="wolfram-alpha",
diff --git a/llama_stack/templates/verification/build.yaml b/llama_stack/templates/verification/build.yaml
index 9f010d651..ce083dbba 100644
--- a/llama_stack/templates/verification/build.yaml
+++ b/llama_stack/templates/verification/build.yaml
@@ -32,7 +32,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/verification/run.yaml b/llama_stack/templates/verification/run.yaml
index 454ecba5b..58b3c576c 100644
--- a/llama_stack/templates/verification/run.yaml
+++ b/llama_stack/templates/verification/run.yaml
@@ -74,13 +74,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/verification/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -126,9 +129,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -138,6 +138,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/inference_store.db
 models:
 - metadata: {}
   model_id: openai/gpt-4o
@@ -154,6 +157,76 @@ models:
   provider_id: openai
   provider_model_id: openai/chatgpt-4o-latest
   model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo-0125
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo-0125
+  model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo
+  model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo-instruct
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo-instruct
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4
+  provider_id: openai
+  provider_model_id: gpt-4
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4-turbo
+  provider_id: openai
+  provider_model_id: gpt-4-turbo
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o
+  provider_id: openai
+  provider_model_id: gpt-4o
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-2024-08-06
+  provider_id: openai
+  provider_model_id: gpt-4o-2024-08-06
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-mini
+  provider_id: openai
+  provider_model_id: gpt-4o-mini
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-audio-preview
+  provider_id: openai
+  provider_model_id: gpt-4o-audio-preview
+  model_type: llm
+- metadata: {}
+  model_id: chatgpt-4o-latest
+  provider_id: openai
+  provider_model_id: chatgpt-4o-latest
+  model_type: llm
+- metadata: {}
+  model_id: o1
+  provider_id: openai
+  provider_model_id: o1
+  model_type: llm
+- metadata: {}
+  model_id: o1-mini
+  provider_id: openai
+  provider_model_id: o1-mini
+  model_type: llm
+- metadata: {}
+  model_id: o3-mini
+  provider_id: openai
+  provider_model_id: o3-mini
+  model_type: llm
+- metadata: {}
+  model_id: o4-mini
+  provider_id: openai
+  provider_model_id: o4-mini
+  model_type: llm
 - metadata:
     embedding_dimension: 1536
     context_length: 8192
@@ -168,6 +241,20 @@ models:
   provider_id: openai
   provider_model_id: openai/text-embedding-3-large
   model_type: embedding
+- metadata:
+    embedding_dimension: 1536
+    context_length: 8192
+  model_id: text-embedding-3-small
+  provider_id: openai
+  provider_model_id: text-embedding-3-small
+  model_type: embedding
+- metadata:
+    embedding_dimension: 3072
+    context_length: 8192
+  model_id: text-embedding-3-large
+  provider_id: openai
+  provider_model_id: text-embedding-3-large
+  model_type: embedding
 - metadata: {}
   model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
   provider_id: fireworks-openai-compat
@@ -505,104 +592,104 @@ models:
   provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.1-8B-Instruct
+  model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-8B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-8B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.1-70B-Instruct
+  model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-70B-Instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-70B-Instruct
-  model_type: llm
-- metadata: {}
-  model_id: Meta-Llama-3.1-405B-Instruct
-  provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-405B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-405B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.2-1B-Instruct
+  model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.2-1B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-1B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.2-1B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.2-3B-Instruct
+  model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.2-3B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.2-3B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.3-70B-Instruct
+  model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.3-70B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.3-70B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.3-70B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Llama-3.2-11B-Vision-Instruct
+  model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-3.2-11B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-3.2-11B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Llama-3.2-90B-Vision-Instruct
+  model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-3.2-90B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-3.2-90B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-Guard-3-8B
+  model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-Guard-3-8B
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-Guard-3-8B
-  model_type: llm
-- metadata: {}
-  model_id: Llama-4-Scout-17B-16E-Instruct
-  provider_id: sambanova-openai-compat
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova-openai-compat
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova-openai-compat
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-Guard-3-8B
+  provider_id: sambanova-openai-compat
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-8B
+  provider_id: sambanova-openai-compat
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
   model_type: llm
 - metadata: {}
   model_id: llama3.1-8b
@@ -640,7 +727,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/verification/verification.py b/llama_stack/templates/verification/verification.py
index e6f74aad8..b58400f26 100644
--- a/llama_stack/templates/verification/verification.py
+++ b/llama_stack/templates/verification/verification.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Dict, List, Tuple
 
 from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import (
@@ -51,7 +50,7 @@ from llama_stack.templates.template import (
 )
 
 
-def get_inference_providers() -> Tuple[List[Provider], Dict[str, List[ProviderModelEntry]]]:
+def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:
     # in this template, we allow each API key to be optional
     providers = [
         (
@@ -113,7 +112,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -156,10 +154,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
     embedding_model = ModelInput(
         model_id="all-MiniLM-L6-v2",
diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml
index 8eb44dc1b..d5ff0f1f4 100644
--- a/llama_stack/templates/vllm-gpu/build.yaml
+++ b/llama_stack/templates/vllm-gpu/build.yaml
@@ -27,7 +27,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index a839aa2c5..6937e2bac 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -45,13 +45,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/vllm-gpu/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -97,9 +100,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -109,6 +109,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/inference_store.db
 models:
 - metadata: {}
   model_id: ${env.INFERENCE_MODEL}
@@ -129,7 +132,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py
index 9bfeadc8d..5775138b1 100644
--- a/llama_stack/templates/vllm-gpu/vllm.py
+++ b/llama_stack/templates/vllm-gpu/vllm.py
@@ -31,7 +31,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -75,10 +74,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/templates/watsonx/build.yaml b/llama_stack/templates/watsonx/build.yaml
index badd643ad..e68ace183 100644
--- a/llama_stack/templates/watsonx/build.yaml
+++ b/llama_stack/templates/watsonx/build.yaml
@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::watsonx
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     safety:
@@ -24,7 +25,9 @@ distribution_spec:
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
-    - inline::code-interpreter
     - inline::rag-runtime
     - remote::model-context-protocol
 image_type: conda
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
diff --git a/llama_stack/templates/watsonx/doc_template.md b/llama_stack/templates/watsonx/doc_template.md
index af0ae15a8..f28dbf0bf 100644
--- a/llama_stack/templates/watsonx/doc_template.md
+++ b/llama_stack/templates/watsonx/doc_template.md
@@ -56,7 +56,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env WATSONX_API_KEY=$WATSONX_API_KEY \
   --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
diff --git a/llama_stack/templates/watsonx/run.yaml b/llama_stack/templates/watsonx/run.yaml
index 1048f7192..e7222fd57 100644
--- a/llama_stack/templates/watsonx/run.yaml
+++ b/llama_stack/templates/watsonx/run.yaml
@@ -18,6 +18,9 @@ providers:
       url: ${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}
       api_key: ${env.WATSONX_API_KEY:}
       project_id: ${env.WATSONX_PROJECT_ID:}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -39,13 +42,16 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/agents_store.db
+      responses_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/responses_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
+      service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/watsonx/trace_store.db}
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -91,9 +97,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:}
       max_results: 3
-  - provider_id: code-interpreter
-    provider_type: inline::code-interpreter
-    config: {}
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
@@ -103,6 +106,9 @@ providers:
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/registry.db
+inference_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/watsonx}/inference_store.db
 models:
 - metadata: {}
   model_id: meta-llama/llama-3-3-70b-instruct
@@ -194,6 +200,11 @@ models:
   provider_id: watsonx
   provider_model_id: meta-llama/llama-guard-3-11b-vision
   model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
 shields: []
 vector_dbs: []
 datasets: []
@@ -204,7 +215,5 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
-- toolgroup_id: builtin::code_interpreter
-  provider_id: code-interpreter
 server:
   port: 8321
diff --git a/llama_stack/templates/watsonx/watsonx.py b/llama_stack/templates/watsonx/watsonx.py
index d59bb6f20..802aaf8f1 100644
--- a/llama_stack/templates/watsonx/watsonx.py
+++ b/llama_stack/templates/watsonx/watsonx.py
@@ -6,7 +6,11 @@
 
 from pathlib import Path
 
-from llama_stack.distribution.datatypes import Provider, ToolGroupInput
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.remote.inference.watsonx import WatsonXConfig
 from llama_stack.providers.remote.inference.watsonx.models import MODEL_ENTRIES
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
@@ -14,7 +18,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::watsonx"],
+        "inference": ["remote::watsonx", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
@@ -25,7 +29,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
-            "inline::code-interpreter",
             "inline::rag-runtime",
             "remote::model-context-protocol",
         ],
@@ -37,6 +40,12 @@ def get_distribution_template() -> DistributionTemplate:
         config=WatsonXConfig.sample_run_config(),
     )
 
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
+
     available_models = {
         "watsonx": MODEL_ENTRIES,
     }
@@ -49,12 +58,17 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::code_interpreter",
-            provider_id="code-interpreter",
-        ),
     ]
 
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
+
     default_models = get_model_registry(available_models)
     return DistributionTemplate(
         name="watsonx",
@@ -67,9 +81,9 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_tool_groups=default_tool_groups,
             ),
         },
diff --git a/llama_stack/ui/.gitignore b/llama_stack/ui/.gitignore
new file mode 100644
index 000000000..5ef6a5207
--- /dev/null
+++ b/llama_stack/ui/.gitignore
@@ -0,0 +1,41 @@
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+/node_modules
+/.pnp
+.pnp.*
+.yarn/*
+!.yarn/patches
+!.yarn/plugins
+!.yarn/releases
+!.yarn/versions
+
+# testing
+/coverage
+
+# next.js
+/.next/
+/out/
+
+# production
+/build
+
+# misc
+.DS_Store
+*.pem
+
+# debug
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# env files (can opt-in for committing if needed)
+.env*
+
+# vercel
+.vercel
+
+# typescript
+*.tsbuildinfo
+next-env.d.ts
diff --git a/llama_stack/ui/.prettierignore b/llama_stack/ui/.prettierignore
new file mode 100644
index 000000000..1b8ac8894
--- /dev/null
+++ b/llama_stack/ui/.prettierignore
@@ -0,0 +1,3 @@
+# Ignore artifacts:
+build
+coverage
diff --git a/llama_stack/ui/.prettierrc b/llama_stack/ui/.prettierrc
new file mode 100644
index 000000000..0967ef424
--- /dev/null
+++ b/llama_stack/ui/.prettierrc
@@ -0,0 +1 @@
+{}
diff --git a/llama_stack/ui/README.md b/llama_stack/ui/README.md
new file mode 100644
index 000000000..b6f803509
--- /dev/null
+++ b/llama_stack/ui/README.md
@@ -0,0 +1,25 @@
+## This is WIP.
+
+We use [shadcn/ui](https://ui.shadcn.com/) for the UI components.
+
+## Getting Started
+
+First, install dependencies:
+
+```bash
+npm install
+```
+
+Then, run the development server:
+
+```bash
+npm run dev
+# or
+yarn dev
+# or
+pnpm dev
+# or
+bun dev
+```
+
+Open [http://localhost:8322](http://localhost:8322) with your browser to see the result.
diff --git a/llama_stack/ui/app/favicon.ico b/llama_stack/ui/app/favicon.ico
new file mode 100644
index 000000000..718d6fea4
Binary files /dev/null and b/llama_stack/ui/app/favicon.ico differ
diff --git a/llama_stack/ui/app/globals.css b/llama_stack/ui/app/globals.css
new file mode 100644
index 000000000..dc98be74c
--- /dev/null
+++ b/llama_stack/ui/app/globals.css
@@ -0,0 +1,122 @@
+@import "tailwindcss";
+@import "tw-animate-css";
+
+@custom-variant dark (&:is(.dark *));
+
+@theme inline {
+  --color-background: var(--background);
+  --color-foreground: var(--foreground);
+  --font-sans: var(--font-geist-sans);
+  --font-mono: var(--font-geist-mono);
+  --color-sidebar-ring: var(--sidebar-ring);
+  --color-sidebar-border: var(--sidebar-border);
+  --color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
+  --color-sidebar-accent: var(--sidebar-accent);
+  --color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
+  --color-sidebar-primary: var(--sidebar-primary);
+  --color-sidebar-foreground: var(--sidebar-foreground);
+  --color-sidebar: var(--sidebar);
+  --color-chart-5: var(--chart-5);
+  --color-chart-4: var(--chart-4);
+  --color-chart-3: var(--chart-3);
+  --color-chart-2: var(--chart-2);
+  --color-chart-1: var(--chart-1);
+  --color-ring: var(--ring);
+  --color-input: var(--input);
+  --color-border: var(--border);
+  --color-destructive: var(--destructive);
+  --color-accent-foreground: var(--accent-foreground);
+  --color-accent: var(--accent);
+  --color-muted-foreground: var(--muted-foreground);
+  --color-muted: var(--muted);
+  --color-secondary-foreground: var(--secondary-foreground);
+  --color-secondary: var(--secondary);
+  --color-primary-foreground: var(--primary-foreground);
+  --color-primary: var(--primary);
+  --color-popover-foreground: var(--popover-foreground);
+  --color-popover: var(--popover);
+  --color-card-foreground: var(--card-foreground);
+  --color-card: var(--card);
+  --radius-sm: calc(var(--radius) - 4px);
+  --radius-md: calc(var(--radius) - 2px);
+  --radius-lg: var(--radius);
+  --radius-xl: calc(var(--radius) + 4px);
+}
+
+:root {
+  --radius: 0.625rem;
+  --background: oklch(1 0 0);
+  --foreground: oklch(0.145 0 0);
+  --card: oklch(1 0 0);
+  --card-foreground: oklch(0.145 0 0);
+  --popover: oklch(1 0 0);
+  --popover-foreground: oklch(0.145 0 0);
+  --primary: oklch(0.205 0 0);
+  --primary-foreground: oklch(0.985 0 0);
+  --secondary: oklch(0.97 0 0);
+  --secondary-foreground: oklch(0.205 0 0);
+  --muted: oklch(0.97 0 0);
+  --muted-foreground: oklch(0.556 0 0);
+  --accent: oklch(0.97 0 0);
+  --accent-foreground: oklch(0.205 0 0);
+  --destructive: oklch(0.577 0.245 27.325);
+  --border: oklch(0.922 0 0);
+  --input: oklch(0.922 0 0);
+  --ring: oklch(0.708 0 0);
+  --chart-1: oklch(0.646 0.222 41.116);
+  --chart-2: oklch(0.6 0.118 184.704);
+  --chart-3: oklch(0.398 0.07 227.392);
+  --chart-4: oklch(0.828 0.189 84.429);
+  --chart-5: oklch(0.769 0.188 70.08);
+  --sidebar: oklch(0.985 0 0);
+  --sidebar-foreground: oklch(0.145 0 0);
+  --sidebar-primary: oklch(0.205 0 0);
+  --sidebar-primary-foreground: oklch(0.985 0 0);
+  --sidebar-accent: oklch(0.97 0 0);
+  --sidebar-accent-foreground: oklch(0.205 0 0);
+  --sidebar-border: oklch(0.922 0 0);
+  --sidebar-ring: oklch(0.708 0 0);
+}
+
+.dark {
+  --background: oklch(0.145 0 0);
+  --foreground: oklch(0.985 0 0);
+  --card: oklch(0.205 0 0);
+  --card-foreground: oklch(0.985 0 0);
+  --popover: oklch(0.205 0 0);
+  --popover-foreground: oklch(0.985 0 0);
+  --primary: oklch(0.922 0 0);
+  --primary-foreground: oklch(0.205 0 0);
+  --secondary: oklch(0.269 0 0);
+  --secondary-foreground: oklch(0.985 0 0);
+  --muted: oklch(0.269 0 0);
+  --muted-foreground: oklch(0.708 0 0);
+  --accent: oklch(0.269 0 0);
+  --accent-foreground: oklch(0.985 0 0);
+  --destructive: oklch(0.704 0.191 22.216);
+  --border: oklch(1 0 0 / 10%);
+  --input: oklch(1 0 0 / 15%);
+  --ring: oklch(0.556 0 0);
+  --chart-1: oklch(0.488 0.243 264.376);
+  --chart-2: oklch(0.696 0.17 162.48);
+  --chart-3: oklch(0.769 0.188 70.08);
+  --chart-4: oklch(0.627 0.265 303.9);
+  --chart-5: oklch(0.645 0.246 16.439);
+  --sidebar: oklch(0.205 0 0);
+  --sidebar-foreground: oklch(0.985 0 0);
+  --sidebar-primary: oklch(0.488 0.243 264.376);
+  --sidebar-primary-foreground: oklch(0.985 0 0);
+  --sidebar-accent: oklch(0.269 0 0);
+  --sidebar-accent-foreground: oklch(0.985 0 0);
+  --sidebar-border: oklch(1 0 0 / 10%);
+  --sidebar-ring: oklch(0.556 0 0);
+}
+
+@layer base {
+  * {
+    @apply border-border outline-ring/50;
+  }
+  body {
+    @apply bg-background text-foreground;
+  }
+}
diff --git a/llama_stack/ui/app/layout.tsx b/llama_stack/ui/app/layout.tsx
new file mode 100644
index 000000000..ed8a6cd5d
--- /dev/null
+++ b/llama_stack/ui/app/layout.tsx
@@ -0,0 +1,55 @@
+import type { Metadata } from "next";
+import { ThemeProvider } from "@/components/ui/theme-provider";
+import { Geist, Geist_Mono } from "next/font/google";
+import { ModeToggle } from "@/components/ui/mode-toggle";
+import "./globals.css";
+
+const geistSans = Geist({ // Module-scope call to the `Geist` loader imported from next/font/google.
+  variable: "--font-geist-sans", // CSS custom property name handed to the loader
+  subsets: ["latin"], // only the latin subset is requested
+}); // NOTE(review): where the result is consumed is not visible here (the JSX below was lost in extraction), confirm
+
+const geistMono = Geist_Mono({ // Module-scope call to the `Geist_Mono` loader imported from next/font/google.
+  variable: "--font-geist-mono", // CSS custom property name handed to the loader
+  subsets: ["latin"], // only the latin subset is requested
+}); // NOTE(review): usage site is not visible in this chunk, confirm
+
+export const metadata: Metadata = { // Exported object typed with Next's `Metadata`; presumably picked up by the App Router as this layout's static metadata, confirm against the Next.js docs
+  title: "Llama Stack", // title text
+  description: "Llama Stack UI", // description text
+};
+
+import { SidebarProvider, SidebarTrigger } from "@/components/ui/sidebar";
+import { AppSidebar } from "@/components/layout/app-sidebar";
+
+export default function Layout({ children }: { children: React.ReactNode }) {
+  return (
+    
+      
+        
+          
+            
+            
+ {/* Header with aligned elements */} +
+
+ +
+
+
+ +
+
+
{children}
+
+
+
+ + + ); +} diff --git a/llama_stack/ui/app/logs/chat-completions/[id]/page.tsx b/llama_stack/ui/app/logs/chat-completions/[id]/page.tsx new file mode 100644 index 000000000..e6feef363 --- /dev/null +++ b/llama_stack/ui/app/logs/chat-completions/[id]/page.tsx @@ -0,0 +1,58 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useParams } from "next/navigation"; +import { ChatCompletion } from "@/lib/types"; +import { ChatCompletionDetailView } from "@/components/chat-completions/chat-completion-detail"; +import { client } from "@/lib/client"; + +export default function ChatCompletionDetailPage() { + const params = useParams(); + const id = params.id as string; + + const [completionDetail, setCompletionDetail] = + useState(null); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + + useEffect(() => { + if (!id) { + setError(new Error("Completion ID is missing.")); + setIsLoading(false); + return; + } + + const fetchCompletionDetail = async () => { + setIsLoading(true); + setError(null); + setCompletionDetail(null); + try { + const response = await client.chat.completions.retrieve(id); + setCompletionDetail(response as ChatCompletion); + } catch (err) { + console.error( + `Error fetching chat completion detail for ID ${id}:`, + err, + ); + setError( + err instanceof Error + ? 
err + : new Error("Failed to fetch completion detail"), + ); + } finally { + setIsLoading(false); + } + }; + + fetchCompletionDetail(); + }, [id]); + + return ( + + ); +} diff --git a/llama_stack/ui/app/logs/chat-completions/layout.tsx b/llama_stack/ui/app/logs/chat-completions/layout.tsx new file mode 100644 index 000000000..f4dbfc782 --- /dev/null +++ b/llama_stack/ui/app/logs/chat-completions/layout.tsx @@ -0,0 +1,19 @@ +"use client"; + +import React from "react"; +import LogsLayout from "@/components/layout/logs-layout"; + +export default function ChatCompletionsLayout({ + children, +}: { + children: React.ReactNode; +}) { + return ( + + {children} + + ); +} diff --git a/llama_stack/ui/app/logs/chat-completions/page.tsx b/llama_stack/ui/app/logs/chat-completions/page.tsx new file mode 100644 index 000000000..5bbfcce94 --- /dev/null +++ b/llama_stack/ui/app/logs/chat-completions/page.tsx @@ -0,0 +1,51 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { ChatCompletion } from "@/lib/types"; +import { ChatCompletionsTable } from "@/components/chat-completions/chat-completions-table"; +import { client } from "@/lib/client"; + +export default function ChatCompletionsPage() { + const [completions, setCompletions] = useState([]); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + + useEffect(() => { + const fetchCompletions = async () => { + setIsLoading(true); + setError(null); + try { + const response = await client.chat.completions.list(); + const data = Array.isArray(response) + ? response + : (response as { data: ChatCompletion[] }).data; + + if (Array.isArray(data)) { + setCompletions(data); + } else { + console.error("Unexpected response structure:", response); + setError(new Error("Unexpected response structure")); + setCompletions([]); + } + } catch (err) { + console.error("Error fetching chat completions:", err); + setError( + err instanceof Error ? 
err : new Error("Failed to fetch completions"), + ); + setCompletions([]); + } finally { + setIsLoading(false); + } + }; + + fetchCompletions(); + }, []); + + return ( + + ); +} diff --git a/llama_stack/ui/app/logs/responses/[id]/page.tsx b/llama_stack/ui/app/logs/responses/[id]/page.tsx new file mode 100644 index 000000000..efe6f0ff3 --- /dev/null +++ b/llama_stack/ui/app/logs/responses/[id]/page.tsx @@ -0,0 +1,125 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useParams } from "next/navigation"; +import type { ResponseObject } from "llama-stack-client/resources/responses/responses"; +import { OpenAIResponse, InputItemListResponse } from "@/lib/types"; +import { ResponseDetailView } from "@/components/responses/responses-detail"; +import { client } from "@/lib/client"; + +export default function ResponseDetailPage() { + const params = useParams(); + const id = params.id as string; + + const [responseDetail, setResponseDetail] = useState( + null, + ); + const [inputItems, setInputItems] = useState( + null, + ); + const [isLoading, setIsLoading] = useState(true); + const [isLoadingInputItems, setIsLoadingInputItems] = useState(true); + const [error, setError] = useState(null); + const [inputItemsError, setInputItemsError] = useState(null); + + // Helper function to convert ResponseObject to OpenAIResponse + const convertResponseObject = ( + responseData: ResponseObject, + ): OpenAIResponse => { + return { + id: responseData.id, + created_at: responseData.created_at, + model: responseData.model, + object: responseData.object, + status: responseData.status, + output: responseData.output as OpenAIResponse["output"], + input: [], // ResponseObject doesn't include input; component uses inputItems prop instead + error: responseData.error, + parallel_tool_calls: responseData.parallel_tool_calls, + previous_response_id: responseData.previous_response_id, + temperature: responseData.temperature, + top_p: responseData.top_p, + truncation: 
responseData.truncation, + user: responseData.user, + }; + }; + + useEffect(() => { + if (!id) { + setError(new Error("Response ID is missing.")); + setIsLoading(false); + return; + } + + const fetchResponseDetail = async () => { + setIsLoading(true); + setIsLoadingInputItems(true); + setError(null); + setInputItemsError(null); + setResponseDetail(null); + setInputItems(null); + + try { + const [responseResult, inputItemsResult] = await Promise.allSettled([ + client.responses.retrieve(id), + client.responses.inputItems.list(id, { order: "asc" }), + ]); + + // Handle response detail result + if (responseResult.status === "fulfilled") { + const convertedResponse = convertResponseObject(responseResult.value); + setResponseDetail(convertedResponse); + } else { + console.error( + `Error fetching response detail for ID ${id}:`, + responseResult.reason, + ); + setError( + responseResult.reason instanceof Error + ? responseResult.reason + : new Error("Failed to fetch response detail"), + ); + } + + // Handle input items result + if (inputItemsResult.status === "fulfilled") { + const inputItemsData = + inputItemsResult.value as unknown as InputItemListResponse; + setInputItems(inputItemsData); + } else { + console.error( + `Error fetching input items for response ID ${id}:`, + inputItemsResult.reason, + ); + setInputItemsError( + inputItemsResult.reason instanceof Error + ? inputItemsResult.reason + : new Error("Failed to fetch input items"), + ); + } + } catch (err) { + console.error(`Unexpected error fetching data for ID ${id}:`, err); + setError( + err instanceof Error ? 
err : new Error("Unexpected error occurred"), + ); + } finally { + setIsLoading(false); + setIsLoadingInputItems(false); + } + }; + + fetchResponseDetail(); + }, [id]); + + return ( + + ); +} diff --git a/llama_stack/ui/app/logs/responses/layout.tsx b/llama_stack/ui/app/logs/responses/layout.tsx new file mode 100644 index 000000000..1fe116e5e --- /dev/null +++ b/llama_stack/ui/app/logs/responses/layout.tsx @@ -0,0 +1,16 @@ +"use client"; + +import React from "react"; +import LogsLayout from "@/components/layout/logs-layout"; + +export default function ResponsesLayout({ + children, +}: { + children: React.ReactNode; +}) { + return ( + + {children} + + ); +} diff --git a/llama_stack/ui/app/logs/responses/page.tsx b/llama_stack/ui/app/logs/responses/page.tsx new file mode 100644 index 000000000..dab0c735f --- /dev/null +++ b/llama_stack/ui/app/logs/responses/page.tsx @@ -0,0 +1,66 @@ +"use client"; + +import { useEffect, useState } from "react"; +import type { ResponseListResponse } from "llama-stack-client/resources/responses/responses"; +import { OpenAIResponse } from "@/lib/types"; +import { ResponsesTable } from "@/components/responses/responses-table"; +import { client } from "@/lib/client"; + +export default function ResponsesPage() { + const [responses, setResponses] = useState([]); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + + // Helper function to convert ResponseListResponse.Data to OpenAIResponse + const convertResponseListData = ( + responseData: ResponseListResponse.Data, + ): OpenAIResponse => { + return { + id: responseData.id, + created_at: responseData.created_at, + model: responseData.model, + object: responseData.object, + status: responseData.status, + output: responseData.output as OpenAIResponse["output"], + input: responseData.input as OpenAIResponse["input"], + error: responseData.error, + parallel_tool_calls: responseData.parallel_tool_calls, + previous_response_id: 
responseData.previous_response_id, + temperature: responseData.temperature, + top_p: responseData.top_p, + truncation: responseData.truncation, + user: responseData.user, + }; + }; + + useEffect(() => { + const fetchResponses = async () => { + setIsLoading(true); + setError(null); + try { + const response = await client.responses.list(); + const responseListData = response as ResponseListResponse; + + const convertedResponses: OpenAIResponse[] = responseListData.data.map( + convertResponseListData, + ); + + setResponses(convertedResponses); + } catch (err) { + console.error("Error fetching responses:", err); + setError( + err instanceof Error ? err : new Error("Failed to fetch responses"), + ); + setResponses([]); + } finally { + setIsLoading(false); + } + }; + + fetchResponses(); + }, []); + + return ( + + ); +} diff --git a/llama_stack/ui/app/page.tsx b/llama_stack/ui/app/page.tsx new file mode 100644 index 000000000..d1d781bdb --- /dev/null +++ b/llama_stack/ui/app/page.tsx @@ -0,0 +1,7 @@ +export default function Home() { + return ( +
+

Welcome to Llama Stack!

+
+ ); +} diff --git a/llama_stack/ui/components.json b/llama_stack/ui/components.json new file mode 100644 index 000000000..4ee62ee10 --- /dev/null +++ b/llama_stack/ui/components.json @@ -0,0 +1,21 @@ +{ + "$schema": "https://ui.shadcn.com/schema.json", + "style": "new-york", + "rsc": true, + "tsx": true, + "tailwind": { + "config": "", + "css": "app/globals.css", + "baseColor": "neutral", + "cssVariables": true, + "prefix": "" + }, + "aliases": { + "components": "@/components", + "utils": "@/lib/utils", + "ui": "@/components/ui", + "lib": "@/lib", + "hooks": "@/hooks" + }, + "iconLibrary": "lucide" +} diff --git a/llama_stack/ui/components/chat-completions/chat-completion-detail.test.tsx b/llama_stack/ui/components/chat-completions/chat-completion-detail.test.tsx new file mode 100644 index 000000000..5348dbc3a --- /dev/null +++ b/llama_stack/ui/components/chat-completions/chat-completion-detail.test.tsx @@ -0,0 +1,193 @@ +import React from "react"; +import { render, screen } from "@testing-library/react"; +import "@testing-library/jest-dom"; +import { ChatCompletionDetailView } from "./chat-completion-detail"; +import { ChatCompletion } from "@/lib/types"; + +// Initial test file setup for ChatCompletionDetailView + +describe("ChatCompletionDetailView", () => { + test("renders skeleton UI when isLoading is true", () => { + const { container } = render( + , + ); + // Use the data-slot attribute for Skeletons + const skeletons = container.querySelectorAll('[data-slot="skeleton"]'); + expect(skeletons.length).toBeGreaterThan(0); + }); + + test("renders error message when error prop is provided", () => { + render( + , + ); + expect( + screen.getByText(/Error loading details for ID err-id: Network Error/), + ).toBeInTheDocument(); + }); + + test("renders default error message when error.message is empty", () => { + render( + , + ); + // Use regex to match the error message regardless of whitespace + expect( + screen.getByText(/Error loading details for 
ID\s*err-id\s*:/), + ).toBeInTheDocument(); + }); + + test("renders error message when error prop is an object without message", () => { + render( + , + ); + // Use regex to match the error message regardless of whitespace + expect( + screen.getByText(/Error loading details for ID\s*err-id\s*:/), + ).toBeInTheDocument(); + }); + + test("renders not found message when completion is null and not loading/error", () => { + render( + , + ); + expect( + screen.getByText("No details found for ID: notfound-id."), + ).toBeInTheDocument(); + }); + + test("renders input, output, and properties for valid completion", () => { + const mockCompletion: ChatCompletion = { + id: "comp_123", + object: "chat.completion", + created: 1710000000, + model: "llama-test-model", + choices: [ + { + index: 0, + message: { role: "assistant", content: "Test output" }, + finish_reason: "stop", + }, + ], + input_messages: [{ role: "user", content: "Test input" }], + }; + render( + , + ); + // Input + expect(screen.getByText("Input")).toBeInTheDocument(); + expect(screen.getByText("Test input")).toBeInTheDocument(); + // Output + expect(screen.getByText("Output")).toBeInTheDocument(); + expect(screen.getByText("Test output")).toBeInTheDocument(); + // Properties + expect(screen.getByText("Properties")).toBeInTheDocument(); + expect(screen.getByText("Created:")).toBeInTheDocument(); + expect( + screen.getByText(new Date(1710000000 * 1000).toLocaleString()), + ).toBeInTheDocument(); + expect(screen.getByText("ID:")).toBeInTheDocument(); + expect(screen.getByText("comp_123")).toBeInTheDocument(); + expect(screen.getByText("Model:")).toBeInTheDocument(); + expect(screen.getByText("llama-test-model")).toBeInTheDocument(); + expect(screen.getByText("Finish Reason:")).toBeInTheDocument(); + expect(screen.getByText("stop")).toBeInTheDocument(); + }); + + test("renders tool call in output and properties when present", () => { + const toolCall = { + function: { name: "search", arguments: '{"query":"llama"}' 
}, + }; + const mockCompletion: ChatCompletion = { + id: "comp_tool", + object: "chat.completion", + created: 1710001000, + model: "llama-tool-model", + choices: [ + { + index: 0, + message: { + role: "assistant", + content: "Tool output", + tool_calls: [toolCall], + }, + finish_reason: "stop", + }, + ], + input_messages: [{ role: "user", content: "Tool input" }], + }; + render( + , + ); + // Output should include the tool call block (should be present twice: input and output) + const toolCallLabels = screen.getAllByText("Tool Call"); + expect(toolCallLabels.length).toBeGreaterThanOrEqual(1); // At least one, but could be two + // The tool call block should contain the formatted tool call string in both input and output + const toolCallBlocks = screen.getAllByText('search({"query":"llama"})'); + expect(toolCallBlocks.length).toBe(2); + // Properties should include the tool call name + expect(screen.getByText("Functions/Tools Called:")).toBeInTheDocument(); + expect(screen.getByText("search")).toBeInTheDocument(); + }); + + test("handles missing/empty fields gracefully", () => { + const mockCompletion: ChatCompletion = { + id: "comp_edge", + object: "chat.completion", + created: 1710002000, + model: "llama-edge-model", + choices: [], // No choices + input_messages: [], // No input messages + }; + render( + , + ); + // Input section should be present but empty + expect(screen.getByText("Input")).toBeInTheDocument(); + // Output section should show fallback message + expect( + screen.getByText("No message found in assistant's choice."), + ).toBeInTheDocument(); + // Properties should show N/A for finish reason + expect(screen.getByText("Finish Reason:")).toBeInTheDocument(); + expect(screen.getByText("N/A")).toBeInTheDocument(); + }); +}); diff --git a/llama_stack/ui/components/chat-completions/chat-completion-detail.tsx b/llama_stack/ui/components/chat-completions/chat-completion-detail.tsx new file mode 100644 index 000000000..200807864 --- /dev/null +++ 
b/llama_stack/ui/components/chat-completions/chat-completion-detail.tsx @@ -0,0 +1,145 @@ +"use client"; + +import { ChatMessage, ChatCompletion } from "@/lib/types"; +import { ChatMessageItem } from "@/components/chat-completions/chat-messasge-item"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { + DetailLoadingView, + DetailErrorView, + DetailNotFoundView, + DetailLayout, + PropertiesCard, + PropertyItem, +} from "@/components/layout/detail-layout"; + +interface ChatCompletionDetailViewProps { + completion: ChatCompletion | null; + isLoading: boolean; + error: Error | null; + id: string; +} + +export function ChatCompletionDetailView({ + completion, + isLoading, + error, + id, +}: ChatCompletionDetailViewProps) { + const title = "Chat Completion Details"; + + if (error) { + return ; + } + + if (isLoading) { + return ; + } + + if (!completion) { + return ; + } + + // Main content cards + const mainContent = ( + <> + + + Input + + + {completion.input_messages?.map((msg, index) => ( + + ))} + {completion.choices?.[0]?.message?.tool_calls && + Array.isArray(completion.choices[0].message.tool_calls) && + !completion.input_messages?.some( + (im) => + im.role === "assistant" && + im.tool_calls && + Array.isArray(im.tool_calls) && + im.tool_calls.length > 0, + ) + ? completion.choices[0].message.tool_calls.map( + (toolCall: any, index: number) => { + const assistantToolCallMessage: ChatMessage = { + role: "assistant", + tool_calls: [toolCall], + content: "", // Ensure content is defined, even if empty + }; + return ( + + ); + }, + ) + : null} + + + + + + Output + + + {completion.choices?.[0]?.message ? ( + + ) : ( +

+ No message found in assistant's choice. +

+ )} +
+
+ + ); + + // Properties sidebar + const sidebar = ( + + + + + + {(() => { + const toolCalls = completion.choices?.[0]?.message?.tool_calls; + if (toolCalls && Array.isArray(toolCalls) && toolCalls.length > 0) { + return ( + +
    + {toolCalls.map((toolCall: any, index: number) => ( +
  • + + {toolCall.function?.name || "N/A"} + +
  • + ))} +
+ + } + hasBorder + /> + ); + } + return null; + })()} +
+ ); + + return ( + + ); +} diff --git a/llama_stack/ui/components/chat-completions/chat-completion-table.test.tsx b/llama_stack/ui/components/chat-completions/chat-completion-table.test.tsx new file mode 100644 index 000000000..c8a55b100 --- /dev/null +++ b/llama_stack/ui/components/chat-completions/chat-completion-table.test.tsx @@ -0,0 +1,347 @@ +import React from "react"; +import { render, screen, fireEvent } from "@testing-library/react"; +import "@testing-library/jest-dom"; +import { ChatCompletionsTable } from "./chat-completions-table"; +import { ChatCompletion } from "@/lib/types"; + +// Mock next/navigation +const mockPush = jest.fn(); +jest.mock("next/navigation", () => ({ + useRouter: () => ({ + push: mockPush, + }), +})); + +// Mock helper functions +jest.mock("@/lib/truncate-text"); +jest.mock("@/lib/format-message-content"); + +// Import the mocked functions to set up default or specific implementations +import { truncateText as originalTruncateText } from "@/lib/truncate-text"; +import { + extractTextFromContentPart as originalExtractTextFromContentPart, + extractDisplayableText as originalExtractDisplayableText, +} from "@/lib/format-message-content"; + +// Cast to jest.Mock for typings +const truncateText = originalTruncateText as jest.Mock; +const extractTextFromContentPart = + originalExtractTextFromContentPart as jest.Mock; +const extractDisplayableText = originalExtractDisplayableText as jest.Mock; + +describe("ChatCompletionsTable", () => { + const defaultProps = { + data: [] as ChatCompletion[], + isLoading: false, + error: null, + }; + + beforeEach(() => { + // Reset all mocks before each test + mockPush.mockClear(); + truncateText.mockClear(); + extractTextFromContentPart.mockClear(); + extractDisplayableText.mockClear(); + + // Default pass-through implementations + truncateText.mockImplementation((text: string | undefined) => text); + extractTextFromContentPart.mockImplementation((content: unknown) => + typeof content === "string" ? 
content : "extracted text", + ); + extractDisplayableText.mockImplementation( + (message: unknown) => + (message as { content?: string })?.content || "extracted output", + ); + }); + + test("renders without crashing with default props", () => { + render(); + expect(screen.getByText("No chat completions found.")).toBeInTheDocument(); + }); + + test("click on a row navigates to the correct URL", () => { + const mockCompletion: ChatCompletion = { + id: "comp_123", + object: "chat.completion", + created: Math.floor(Date.now() / 1000), + model: "llama-test-model", + choices: [ + { + index: 0, + message: { role: "assistant", content: "Test output" }, + finish_reason: "stop", + }, + ], + input_messages: [{ role: "user", content: "Test input" }], + }; + + // Set up mocks to return expected values + extractTextFromContentPart.mockReturnValue("Test input"); + extractDisplayableText.mockReturnValue("Test output"); + + render(); + + const row = screen.getByText("Test input").closest("tr"); + if (row) { + fireEvent.click(row); + expect(mockPush).toHaveBeenCalledWith("/logs/chat-completions/comp_123"); + } else { + throw new Error('Row with "Test input" not found for router mock test.'); + } + }); + + describe("Loading State", () => { + test("renders skeleton UI when isLoading is true", () => { + const { container } = render( + , + ); + + // Check for skeleton in the table caption + const tableCaption = container.querySelector("caption"); + expect(tableCaption).toBeInTheDocument(); + if (tableCaption) { + const captionSkeleton = tableCaption.querySelector( + '[data-slot="skeleton"]', + ); + expect(captionSkeleton).toBeInTheDocument(); + } + + // Check for skeletons in the table body cells + const tableBody = container.querySelector("tbody"); + expect(tableBody).toBeInTheDocument(); + if (tableBody) { + const bodySkeletons = tableBody.querySelectorAll( + '[data-slot="skeleton"]', + ); + expect(bodySkeletons.length).toBeGreaterThan(0); + } + }); + }); + + describe("Error State", 
() => { + test("renders error message when error prop is provided", () => { + const errorMessage = "Network Error"; + render( + , + ); + expect( + screen.getByText(`Error fetching data: ${errorMessage}`), + ).toBeInTheDocument(); + }); + + test("renders default error message when error.message is not available", () => { + render( + , + ); + expect( + screen.getByText("Error fetching data: An unknown error occurred"), + ).toBeInTheDocument(); + }); + + test("renders default error message when error prop is an object without message", () => { + render(); + expect( + screen.getByText("Error fetching data: An unknown error occurred"), + ).toBeInTheDocument(); + }); + }); + + describe("Empty State", () => { + test('renders "No chat completions found." and no table when data array is empty', () => { + render(); + expect( + screen.getByText("No chat completions found."), + ).toBeInTheDocument(); + + // Ensure that the table structure is NOT rendered in the empty state + const table = screen.queryByRole("table"); + expect(table).not.toBeInTheDocument(); + }); + }); + + describe("Data Rendering", () => { + test("renders table caption, headers, and completion data correctly", () => { + const mockCompletions = [ + { + id: "comp_1", + object: "chat.completion", + created: 1710000000, + model: "llama-test-model", + choices: [ + { + index: 0, + message: { role: "assistant", content: "Test output" }, + finish_reason: "stop", + }, + ], + input_messages: [{ role: "user", content: "Test input" }], + }, + { + id: "comp_2", + object: "chat.completion", + created: 1710001000, + model: "llama-another-model", + choices: [ + { + index: 0, + message: { role: "assistant", content: "Another output" }, + finish_reason: "stop", + }, + ], + input_messages: [{ role: "user", content: "Another input" }], + }, + ]; + + // Set up mocks to return expected values + extractTextFromContentPart.mockImplementation((content: unknown) => { + if (content === "Test input") return "Test input"; + if (content 
=== "Another input") return "Another input"; + return "extracted text"; + }); + extractDisplayableText.mockImplementation((message: unknown) => { + const msg = message as { content?: string }; + if (msg?.content === "Test output") return "Test output"; + if (msg?.content === "Another output") return "Another output"; + return "extracted output"; + }); + + render( + , + ); + + // Table caption + expect( + screen.getByText("A list of your recent chat completions."), + ).toBeInTheDocument(); + + // Table headers + expect(screen.getByText("Input")).toBeInTheDocument(); + expect(screen.getByText("Output")).toBeInTheDocument(); + expect(screen.getByText("Model")).toBeInTheDocument(); + expect(screen.getByText("Created")).toBeInTheDocument(); + + // Data rows + expect(screen.getByText("Test input")).toBeInTheDocument(); + expect(screen.getByText("Test output")).toBeInTheDocument(); + expect(screen.getByText("llama-test-model")).toBeInTheDocument(); + expect( + screen.getByText(new Date(1710000000 * 1000).toLocaleString()), + ).toBeInTheDocument(); + + expect(screen.getByText("Another input")).toBeInTheDocument(); + expect(screen.getByText("Another output")).toBeInTheDocument(); + expect(screen.getByText("llama-another-model")).toBeInTheDocument(); + expect( + screen.getByText(new Date(1710001000 * 1000).toLocaleString()), + ).toBeInTheDocument(); + }); + }); + + describe("Text Truncation and Content Extraction", () => { + test("truncates long input and output text", () => { + // Specific mock implementation for this test + truncateText.mockImplementation( + (text: string | undefined, maxLength?: number) => { + const defaultTestMaxLength = 10; + const effectiveMaxLength = maxLength ?? defaultTestMaxLength; + return typeof text === "string" && text.length > effectiveMaxLength + ? text.slice(0, effectiveMaxLength) + "..." 
+ : text; + }, + ); + + const longInput = + "This is a very long input message that should be truncated."; + const longOutput = + "This is a very long output message that should also be truncated."; + + extractTextFromContentPart.mockReturnValue(longInput); + extractDisplayableText.mockReturnValue(longOutput); + + const mockCompletions = [ + { + id: "comp_trunc", + object: "chat.completion", + created: 1710002000, + model: "llama-trunc-model", + choices: [ + { + index: 0, + message: { role: "assistant", content: longOutput }, + finish_reason: "stop", + }, + ], + input_messages: [{ role: "user", content: longInput }], + }, + ]; + + render( + , + ); + + // The truncated text should be present for both input and output + const truncatedTexts = screen.getAllByText( + longInput.slice(0, 10) + "...", + ); + expect(truncatedTexts.length).toBe(2); // one for input, one for output + truncatedTexts.forEach((textElement) => + expect(textElement).toBeInTheDocument(), + ); + }); + + test("uses content extraction functions correctly", () => { + const mockCompletion = { + id: "comp_extract", + object: "chat.completion", + created: 1710003000, + model: "llama-extract-model", + choices: [ + { + index: 0, + message: { role: "assistant", content: "Extracted output" }, + finish_reason: "stop", + }, + ], + input_messages: [{ role: "user", content: "Extracted input" }], + }; + + extractTextFromContentPart.mockReturnValue("Extracted input"); + extractDisplayableText.mockReturnValue("Extracted output"); + + render( + , + ); + + // Verify the extraction functions were called + expect(extractTextFromContentPart).toHaveBeenCalledWith( + "Extracted input", + ); + expect(extractDisplayableText).toHaveBeenCalledWith({ + role: "assistant", + content: "Extracted output", + }); + + // Verify the extracted content is displayed + expect(screen.getByText("Extracted input")).toBeInTheDocument(); + expect(screen.getByText("Extracted output")).toBeInTheDocument(); + }); + }); +}); diff --git 
a/llama_stack/ui/components/chat-completions/chat-completions-table.tsx b/llama_stack/ui/components/chat-completions/chat-completions-table.tsx new file mode 100644 index 000000000..5f1d2f03d --- /dev/null +++ b/llama_stack/ui/components/chat-completions/chat-completions-table.tsx @@ -0,0 +1,43 @@ +"use client"; + +import { ChatCompletion } from "@/lib/types"; +import { LogsTable, LogTableRow } from "@/components/logs/logs-table"; +import { + extractTextFromContentPart, + extractDisplayableText, +} from "@/lib/format-message-content"; + +interface ChatCompletionsTableProps { + data: ChatCompletion[]; + isLoading: boolean; + error: Error | null; +} + +function formatChatCompletionToRow(completion: ChatCompletion): LogTableRow { + return { + id: completion.id, + input: extractTextFromContentPart(completion.input_messages?.[0]?.content), + output: extractDisplayableText(completion.choices?.[0]?.message), + model: completion.model, + createdTime: new Date(completion.created * 1000).toLocaleString(), + detailPath: `/logs/chat-completions/${completion.id}`, + }; +} + +export function ChatCompletionsTable({ + data, + isLoading, + error, +}: ChatCompletionsTableProps) { + const formattedData = data.map(formatChatCompletionToRow); + + return ( + + ); +} diff --git a/llama_stack/ui/components/chat-completions/chat-messasge-item.tsx b/llama_stack/ui/components/chat-completions/chat-messasge-item.tsx new file mode 100644 index 000000000..2e8593bfb --- /dev/null +++ b/llama_stack/ui/components/chat-completions/chat-messasge-item.tsx @@ -0,0 +1,76 @@ +"use client"; + +import { ChatMessage } from "@/lib/types"; +import React from "react"; +import { formatToolCallToString } from "@/lib/format-tool-call"; +import { extractTextFromContentPart } from "@/lib/format-message-content"; +import { + MessageBlock, + ToolCallBlock, +} from "@/components/ui/message-components"; + +interface ChatMessageItemProps { + message: ChatMessage; +} +export function ChatMessageItem({ message }: 
ChatMessageItemProps) { + switch (message.role) { + case "system": + return ( + + ); + case "user": + return ( + + ); + + case "assistant": + if ( + message.tool_calls && + Array.isArray(message.tool_calls) && + message.tool_calls.length > 0 + ) { + return ( + <> + {message.tool_calls.map((toolCall: any, index: number) => { + const formattedToolCall = formatToolCallToString(toolCall); + const toolCallContent = ( + + {formattedToolCall || "Error: Could not display tool call"} + + ); + return ( + + ); + })} + + ); + } else { + return ( + + ); + } + case "tool": + const toolOutputContent = ( + + {extractTextFromContentPart(message.content)} + + ); + return ( + + ); + } + return null; +} diff --git a/llama_stack/ui/components/layout/app-sidebar.tsx b/llama_stack/ui/components/layout/app-sidebar.tsx new file mode 100644 index 000000000..1c53d6cc5 --- /dev/null +++ b/llama_stack/ui/components/layout/app-sidebar.tsx @@ -0,0 +1,82 @@ +"use client"; + +import { MessageSquareText, MessagesSquare, MoveUpRight } from "lucide-react"; +import Link from "next/link"; +import { usePathname } from "next/navigation"; +import { cn } from "@/lib/utils"; + +import { + Sidebar, + SidebarContent, + SidebarGroup, + SidebarGroupContent, + SidebarGroupLabel, + SidebarMenu, + SidebarMenuButton, + SidebarMenuItem, + SidebarHeader, +} from "@/components/ui/sidebar"; + +const logItems = [ + { + title: "Chat Completions", + url: "/logs/chat-completions", + icon: MessageSquareText, + }, + { + title: "Responses", + url: "/logs/responses", + icon: MessagesSquare, + }, + { + title: "Documentation", + url: "https://llama-stack.readthedocs.io/en/latest/references/api_reference/index.html", + icon: MoveUpRight, + }, +]; + +export function AppSidebar() { + const pathname = usePathname(); + + return ( + + + Llama Stack + + + + Logs + + + {logItems.map((item) => { + const isActive = pathname.startsWith(item.url); + return ( + + + + + {item.title} + + + + ); + })} + + + + + + ); +} diff --git 
a/llama_stack/ui/components/layout/detail-layout.tsx b/llama_stack/ui/components/layout/detail-layout.tsx new file mode 100644 index 000000000..58b912703 --- /dev/null +++ b/llama_stack/ui/components/layout/detail-layout.tsx @@ -0,0 +1,141 @@ +import React from "react"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Skeleton } from "@/components/ui/skeleton"; + +/** Loading placeholder for a detail page: a title skeleton, two repeated placeholder groups, and five placeholder property rows. */ export function DetailLoadingView({ title }: { title: string }) { + return ( + <> + {/* Title Skeleton */} +
+
+ {[...Array(2)].map((_, i) => ( + + + + + + + + + + + + + ))} +
+
+
+ {" "} + {/* Properties Title Skeleton */} + {[...Array(5)].map((_, i) => ( +
+ + +
+ ))} +
+
+
+ + ); +} + +/** Error state for a detail page: shows `title` and the text "Error loading details for ID {id}: {error.message}". */ export function DetailErrorView({ + title, + id, + error, +}: { + title: string; + id: string; + error: Error; +}) { + return ( + <> +

{title}

+

+ Error loading details for ID {id}: {error.message} +

+ + ); +} + +/** Not-found state for a detail page: shows `title` and the text "No details found for ID: {id}.". */ export function DetailNotFoundView({ + title, + id, +}: { + title: string; + id: string; +}) { + return ( + <> +

{title}

+

No details found for ID: {id}.

+ + ); +} + +export interface PropertyItemProps { + label: string; + value: React.ReactNode; + className?: string; + hasBorder?: boolean; +} + +/** One labelled property row: renders `label:` followed by `value`. String and number values take a different branch from other React nodes, which are rendered as given. `className` defaults to "" and `hasBorder` to false. */ export function PropertyItem({ + label, + value, + className = "", + hasBorder = false, +}: PropertyItemProps) { + return ( +
  • + {label}:{" "} + {typeof value === "string" || typeof value === "number" ? ( + {value} + ) : ( + value + )} +
  • + ); +} + +export interface PropertiesCardProps { + children: React.ReactNode; +} + +/** Card titled "Properties" that wraps the given children. */ export function PropertiesCard({ children }: PropertiesCardProps) { + return ( + + + Properties + + +
      {children}
    +
    +
    + ); +} + +export interface DetailLayoutProps { + title: string; + mainContent: React.ReactNode; + sidebar: React.ReactNode; +} + +/** Detail page frame: renders `title`, then `mainContent` and `sidebar` in separate containers. */ export function DetailLayout({ + title, + mainContent, + sidebar, +}: DetailLayoutProps) { + return ( + <> +

    {title}

    +
    +
    {mainContent}
    +
    {sidebar}
    +
    + + ); +} diff --git a/llama_stack/ui/components/layout/logs-layout.tsx b/llama_stack/ui/components/layout/logs-layout.tsx new file mode 100644 index 000000000..468ad6e9a --- /dev/null +++ b/llama_stack/ui/components/layout/logs-layout.tsx @@ -0,0 +1,49 @@ +"use client"; + +import React from "react"; +import { usePathname, useParams } from "next/navigation"; +import { + PageBreadcrumb, + BreadcrumbSegment, +} from "@/components/layout/page-breadcrumb"; +import { truncateText } from "@/lib/truncate-text"; + +interface LogsLayoutProps { + children: React.ReactNode; + sectionLabel: string; + basePath: string; +} + +/** Layout for a logs section that derives breadcrumb segments from the route: a single unlinked `sectionLabel` when the pathname equals `basePath`, replaced by a linked `sectionLabel` plus a "Details (...)" entry (id passed through truncateText(idParam, 20)) when a non-empty string `id` route param is present. The guarded block before `children` (presumably the PageBreadcrumb) is only rendered when at least one segment exists. */ export default function LogsLayout({ + children, + sectionLabel, + basePath, +}: LogsLayoutProps) { + const pathname = usePathname(); + const params = useParams(); + + let segments: BreadcrumbSegment[] = []; + + if (pathname === basePath) { + segments = [{ label: sectionLabel }]; + } + + const idParam = params?.id; + if (idParam && typeof idParam === "string") { + segments = [ + { label: sectionLabel, href: basePath }, + { label: `Details (${truncateText(idParam, 20)})` }, + ]; + } + + return ( +
    + <> + {segments.length > 0 && ( + + )} + {children} + +
    + ); +} diff --git a/llama_stack/ui/components/layout/page-breadcrumb.tsx b/llama_stack/ui/components/layout/page-breadcrumb.tsx new file mode 100644 index 000000000..fdb561d68 --- /dev/null +++ b/llama_stack/ui/components/layout/page-breadcrumb.tsx @@ -0,0 +1,49 @@ +"use client"; + +import Link from "next/link"; +import React from "react"; +import { + Breadcrumb, + BreadcrumbItem, + BreadcrumbLink, + BreadcrumbList, + BreadcrumbPage, + BreadcrumbSeparator, +} from "@/components/ui/breadcrumb"; + +export interface BreadcrumbSegment { + label: string; + href?: string; +} + +interface PageBreadcrumbProps { + segments: BreadcrumbSegment[]; + className?: string; +} + +/** Breadcrumb trail built from `segments`. Returns null when `segments` is missing or empty. A segment with an `href` takes the link branch, one without takes the plain branch, and the trailing element guarded by `index < segments.length - 1` is emitted after every segment except the last. */ export function PageBreadcrumb({ segments, className }: PageBreadcrumbProps) { + if (!segments || segments.length === 0) { + return null; + } + + return ( + + + {segments.map((segment, index) => ( + + + {segment.href ? ( + + {segment.label} + + ) : ( + {segment.label} + )} + + {index < segments.length - 1 && } + + ))} + + + ); +} diff --git a/llama_stack/ui/components/logs/logs-table.test.tsx b/llama_stack/ui/components/logs/logs-table.test.tsx new file mode 100644 index 000000000..88263b2fc --- /dev/null +++ b/llama_stack/ui/components/logs/logs-table.test.tsx @@ -0,0 +1,350 @@ +import React from "react"; +import { render, screen, fireEvent } from "@testing-library/react"; +import "@testing-library/jest-dom"; +import { LogsTable, LogTableRow } from "./logs-table"; + +// Mock next/navigation +const mockPush = jest.fn(); +jest.mock("next/navigation", () => ({ + useRouter: () => ({ + push: mockPush, + }), +})); + +// Mock helper functions +jest.mock("@/lib/truncate-text"); + +// Import the mocked functions +import { truncateText as originalTruncateText } from "@/lib/truncate-text"; + +// Cast to jest.Mock for typings +const truncateText = originalTruncateText as jest.Mock; + +describe("LogsTable", () => { + const defaultProps = { + data: [] as LogTableRow[], + isLoading: false, + error: null, + caption: 
"Test table caption", + emptyMessage: "No data found", + }; + + beforeEach(() => { + // Reset all mocks before each test + mockPush.mockClear(); + truncateText.mockClear(); + + // Default pass-through implementation + truncateText.mockImplementation((text: string | undefined) => text); + }); + + test("renders without crashing with default props", () => { + render(); + expect(screen.getByText("No data found")).toBeInTheDocument(); + }); + + test("click on a row navigates to the correct URL", () => { + const mockData: LogTableRow[] = [ + { + id: "row_123", + input: "Test input", + output: "Test output", + model: "test-model", + createdTime: "2024-01-01 12:00:00", + detailPath: "/test/path/row_123", + }, + ]; + + render(); + + const row = screen.getByText("Test input").closest("tr"); + if (row) { + fireEvent.click(row); + expect(mockPush).toHaveBeenCalledWith("/test/path/row_123"); + } else { + throw new Error('Row with "Test input" not found for router mock test.'); + } + }); + + describe("Loading State", () => { + test("renders skeleton UI when isLoading is true", () => { + const { container } = render( + , + ); + + // Check for skeleton in the table caption + const tableCaption = container.querySelector("caption"); + expect(tableCaption).toBeInTheDocument(); + if (tableCaption) { + const captionSkeleton = tableCaption.querySelector( + '[data-slot="skeleton"]', + ); + expect(captionSkeleton).toBeInTheDocument(); + } + + // Check for skeletons in the table body cells + const tableBody = container.querySelector("tbody"); + expect(tableBody).toBeInTheDocument(); + if (tableBody) { + const bodySkeletons = tableBody.querySelectorAll( + '[data-slot="skeleton"]', + ); + expect(bodySkeletons.length).toBeGreaterThan(0); + } + + // Check that table headers are still rendered + expect(screen.getByText("Input")).toBeInTheDocument(); + expect(screen.getByText("Output")).toBeInTheDocument(); + expect(screen.getByText("Model")).toBeInTheDocument(); + 
expect(screen.getByText("Created")).toBeInTheDocument(); + }); + + test("renders correct number of skeleton rows", () => { + const { container } = render( + , + ); + + const skeletonRows = container.querySelectorAll("tbody tr"); + expect(skeletonRows.length).toBe(3); // Should render 3 skeleton rows + }); + }); + + describe("Error State", () => { + test("renders error message when error prop is provided", () => { + const errorMessage = "Network Error"; + render( + , + ); + expect( + screen.getByText(`Error fetching data: ${errorMessage}`), + ).toBeInTheDocument(); + }); + + test("renders default error message when error.message is not available", () => { + render( + , + ); + expect( + screen.getByText("Error fetching data: An unknown error occurred"), + ).toBeInTheDocument(); + }); + + test("renders default error message when error prop is an object without message", () => { + render(); + expect( + screen.getByText("Error fetching data: An unknown error occurred"), + ).toBeInTheDocument(); + }); + + test("does not render table when in error state", () => { + render( + , + ); + const table = screen.queryByRole("table"); + expect(table).not.toBeInTheDocument(); + }); + }); + + describe("Empty State", () => { + test("renders custom empty message when data array is empty", () => { + render( + , + ); + expect(screen.getByText("Custom empty message")).toBeInTheDocument(); + + // Ensure that the table structure is NOT rendered in the empty state + const table = screen.queryByRole("table"); + expect(table).not.toBeInTheDocument(); + }); + }); + + describe("Data Rendering", () => { + test("renders table caption, headers, and data correctly", () => { + const mockData: LogTableRow[] = [ + { + id: "row_1", + input: "First input", + output: "First output", + model: "model-1", + createdTime: "2024-01-01 12:00:00", + detailPath: "/path/1", + }, + { + id: "row_2", + input: "Second input", + output: "Second output", + model: "model-2", + createdTime: "2024-01-02 13:00:00", + 
detailPath: "/path/2", + }, + ]; + + render( + , + ); + + // Table caption + expect(screen.getByText("Custom table caption")).toBeInTheDocument(); + + // Table headers + expect(screen.getByText("Input")).toBeInTheDocument(); + expect(screen.getByText("Output")).toBeInTheDocument(); + expect(screen.getByText("Model")).toBeInTheDocument(); + expect(screen.getByText("Created")).toBeInTheDocument(); + + // Data rows + expect(screen.getByText("First input")).toBeInTheDocument(); + expect(screen.getByText("First output")).toBeInTheDocument(); + expect(screen.getByText("model-1")).toBeInTheDocument(); + expect(screen.getByText("2024-01-01 12:00:00")).toBeInTheDocument(); + + expect(screen.getByText("Second input")).toBeInTheDocument(); + expect(screen.getByText("Second output")).toBeInTheDocument(); + expect(screen.getByText("model-2")).toBeInTheDocument(); + expect(screen.getByText("2024-01-02 13:00:00")).toBeInTheDocument(); + }); + + test("applies correct CSS classes to table rows", () => { + const mockData: LogTableRow[] = [ + { + id: "row_1", + input: "Test input", + output: "Test output", + model: "test-model", + createdTime: "2024-01-01 12:00:00", + detailPath: "/test/path", + }, + ]; + + render(); + + const row = screen.getByText("Test input").closest("tr"); + expect(row).toHaveClass("cursor-pointer"); + expect(row).toHaveClass("hover:bg-muted/50"); + }); + + test("applies correct alignment to Created column", () => { + const mockData: LogTableRow[] = [ + { + id: "row_1", + input: "Test input", + output: "Test output", + model: "test-model", + createdTime: "2024-01-01 12:00:00", + detailPath: "/test/path", + }, + ]; + + render(); + + const createdCell = screen.getByText("2024-01-01 12:00:00").closest("td"); + expect(createdCell).toHaveClass("text-right"); + }); + }); + + describe("Text Truncation", () => { + test("truncates input and output text using truncateText function", () => { + // Mock truncateText to return truncated versions + 
truncateText.mockImplementation((text: string | undefined) => { + if (typeof text === "string" && text.length > 10) { + return text.slice(0, 10) + "..."; + } + return text; + }); + + const longInput = + "This is a very long input text that should be truncated"; + const longOutput = + "This is a very long output text that should be truncated"; + + const mockData: LogTableRow[] = [ + { + id: "row_1", + input: longInput, + output: longOutput, + model: "test-model", + createdTime: "2024-01-01 12:00:00", + detailPath: "/test/path", + }, + ]; + + render(); + + // Verify truncateText was called + expect(truncateText).toHaveBeenCalledWith(longInput); + expect(truncateText).toHaveBeenCalledWith(longOutput); + + // Verify truncated text is displayed + const truncatedTexts = screen.getAllByText("This is a ..."); + expect(truncatedTexts).toHaveLength(2); // one for input, one for output + truncatedTexts.forEach((textElement) => + expect(textElement).toBeInTheDocument(), + ); + }); + + test("does not truncate model names", () => { + const mockData: LogTableRow[] = [ + { + id: "row_1", + input: "Test input", + output: "Test output", + model: "very-long-model-name-that-should-not-be-truncated", + createdTime: "2024-01-01 12:00:00", + detailPath: "/test/path", + }, + ]; + + render(); + + // Model name should not be passed to truncateText + expect(truncateText).not.toHaveBeenCalledWith( + "very-long-model-name-that-should-not-be-truncated", + ); + + // Full model name should be displayed + expect( + screen.getByText("very-long-model-name-that-should-not-be-truncated"), + ).toBeInTheDocument(); + }); + }); + + describe("Accessibility", () => { + test("table has proper role and structure", () => { + const mockData: LogTableRow[] = [ + { + id: "row_1", + input: "Test input", + output: "Test output", + model: "test-model", + createdTime: "2024-01-01 12:00:00", + detailPath: "/test/path", + }, + ]; + + render(); + + const table = screen.getByRole("table"); + 
expect(table).toBeInTheDocument(); + + const columnHeaders = screen.getAllByRole("columnheader"); + expect(columnHeaders).toHaveLength(4); + + const rows = screen.getAllByRole("row"); + expect(rows).toHaveLength(2); // 1 header row + 1 data row + }); + }); +}); diff --git a/llama_stack/ui/components/logs/logs-table.tsx b/llama_stack/ui/components/logs/logs-table.tsx new file mode 100644 index 000000000..33afea61b --- /dev/null +++ b/llama_stack/ui/components/logs/logs-table.tsx @@ -0,0 +1,113 @@ +"use client"; + +import { useRouter } from "next/navigation"; +import { truncateText } from "@/lib/truncate-text"; +import { + Table, + TableBody, + TableCaption, + TableCell, + TableHead, + TableHeader, + TableRow, +} from "@/components/ui/table"; +import { Skeleton } from "@/components/ui/skeleton"; + +// Generic table row data interface +export interface LogTableRow { + id: string; + input: string; + output: string; + model: string; + createdTime: string; + detailPath: string; +} + +interface LogsTableProps { + data: LogTableRow[]; + isLoading: boolean; + error: Error | null; + caption: string; + emptyMessage: string; +} + +/** Generic logs table with Input / Output / Model / Created columns. States are checked in this order: loading (three skeleton rows under the shared header), error ("Error fetching data: ..." with "An unknown error occurred" as the fallback message), empty (`emptyMessage`), and otherwise one clickable row per entry that calls router.push(row.detailPath). Input and output cells go through truncateText; model and createdTime are shown as given. */ export function LogsTable({ + data, + isLoading, + error, + caption, + emptyMessage, +}: LogsTableProps) { + const router = useRouter(); + + const tableHeader = ( + + + Input + Output + Model + Created + + + ); + + if (isLoading) { + return ( + + + + + {tableHeader} + + {[...Array(3)].map((_, i) => ( + + + + + + + + + + + + + + + ))} + +
    + ); + } + + if (error) { + return ( +

    Error fetching data: {error.message || "An unknown error occurred"}

    + ); + } + + if (data.length === 0) { + return

    {emptyMessage}

    ; + } + + return ( + + {caption} + {tableHeader} + + {data.map((row) => ( + router.push(row.detailPath)} + className="cursor-pointer hover:bg-muted/50" + > + {truncateText(row.input)} + {truncateText(row.output)} + {row.model} + {row.createdTime} + + ))} + +
    + ); +} diff --git a/llama_stack/ui/components/responses/grouping/grouped-items-display.tsx b/llama_stack/ui/components/responses/grouping/grouped-items-display.tsx new file mode 100644 index 000000000..6ddc0eacc --- /dev/null +++ b/llama_stack/ui/components/responses/grouping/grouped-items-display.tsx @@ -0,0 +1,56 @@ +import { useFunctionCallGrouping } from "../hooks/function-call-grouping"; +import { ItemRenderer } from "../items/item-renderer"; +import { GroupedFunctionCallItemComponent } from "../items/grouped-function-call-item"; +import { + isFunctionCallItem, + isFunctionCallOutputItem, + AnyResponseItem, +} from "../utils/item-types"; + +interface GroupedItemsDisplayProps { + items: AnyResponseItem[]; + keyPrefix: string; + defaultRole?: string; +} + +/** Renders `items` after grouping them with useFunctionCallGrouping: an entry whose item is a function call and whose `outputItem` is a function-call output takes the grouped branch, every other entry takes the individual-item branch. `defaultRole` defaults to "unknown". */ export function GroupedItemsDisplay({ + items, + keyPrefix, + defaultRole = "unknown", +}: GroupedItemsDisplayProps) { + const groupedItems = useFunctionCallGrouping(items); + + return ( + <> + {groupedItems.map((groupedItem) => { + // If this is a function call with an output, render the grouped component + if ( + groupedItem.outputItem && + isFunctionCallItem(groupedItem.item) && + isFunctionCallOutputItem(groupedItem.outputItem) + ) { + return ( + + ); + } + + // Otherwise, render the individual item + return ( + + ); + })} + + ); +} diff --git a/llama_stack/ui/components/responses/hooks/function-call-grouping.ts b/llama_stack/ui/components/responses/hooks/function-call-grouping.ts new file mode 100644 index 000000000..2994354d5 --- /dev/null +++ b/llama_stack/ui/components/responses/hooks/function-call-grouping.ts @@ -0,0 +1,92 @@ +import { useMemo } from "react"; +import { + isFunctionCallOutputItem, + AnyResponseItem, + FunctionCallOutputItem, +} from "../utils/item-types"; + +/** A response item with its original index and, when it was paired, the matching output item and that output's index. */ export interface GroupedItem { + item: AnyResponseItem; + index: number; + outputItem?: AnyResponseItem; + outputIndex?: number; +} + +/** + * Hook to group function calls with their corresponding outputs + * @param items 
Array of items to group + * @returns Array of grouped items with their outputs (memoized on `items`; a paired output is not emitted again as a standalone entry when it follows its call) + */ +export function useFunctionCallGrouping( + items: AnyResponseItem[], +): GroupedItem[] { + return useMemo(() => { + const groupedItems: GroupedItem[] = []; + const processedIndices = new Set(); + + // Build a map of call_id to indices for function_call_output items + const callIdToIndices = new Map(); + + for (let i = 0; i < items.length; i++) { + const item = items[i]; + if (isFunctionCallOutputItem(item)) { + if (!callIdToIndices.has(item.call_id)) { + callIdToIndices.set(item.call_id, []); + } + callIdToIndices.get(item.call_id)!.push(i); + } + } + + // Process items and group function calls with their outputs + for (let i = 0; i < items.length; i++) { + if (processedIndices.has(i)) { + continue; + } + + const currentItem = items[i]; + + if ( + currentItem.type === "function_call" && + "name" in currentItem && + "call_id" in currentItem + ) { + const functionCallId = currentItem.call_id as string; + let outputIndex = -1; + let outputItem: FunctionCallOutputItem | null = null; + + const relatedIndices = callIdToIndices.get(functionCallId) || []; + for (const idx of relatedIndices) { /* NOTE(review): the loop breaks on its first iteration, so the first output index recorded for this call_id is always used and processedIndices is not consulted. An output that precedes its call (already emitted on its own), or a call_id shared by two calls, would therefore reuse the same output — confirm whether such input can occur */ + const potentialOutput = items[idx]; + outputIndex = idx; + outputItem = potentialOutput as FunctionCallOutputItem; + break; + } + + if (outputItem && outputIndex !== -1) { + // Group function call with its function_call_output + groupedItems.push({ + item: currentItem, + index: i, + outputItem, + outputIndex, + }); + + // Mark both items as processed + processedIndices.add(i); + processedIndices.add(outputIndex); + + // Matching function call and output found, skip to next item + continue; + } + } + // render normally + groupedItems.push({ + item: currentItem, + index: i, + }); + processedIndices.add(i); + } + + return groupedItems; + }, [items]); +} diff --git a/llama_stack/ui/components/responses/items/function-call-item.tsx b/llama_stack/ui/components/responses/items/function-call-item.tsx new 
file mode 100644 index 000000000..beca935f0 --- /dev/null +++ b/llama_stack/ui/components/responses/items/function-call-item.tsx @@ -0,0 +1,29 @@ +import { + MessageBlock, + ToolCallBlock, +} from "@/components/ui/message-components"; +import { FunctionCallItem } from "../utils/item-types"; + +interface FunctionCallItemProps { + item: FunctionCallItem; + index: number; + keyPrefix: string; +} + +/** Displays a function call as `name(arguments)`, substituting "unknown" for a falsy name and "{}" for falsy arguments. */ export function FunctionCallItemComponent({ + item, + index, + keyPrefix, +}: FunctionCallItemProps) { + const name = item.name || "unknown"; + const args = item.arguments || "{}"; + const formattedFunctionCall = `${name}(${args})`; + + return ( + {formattedFunctionCall}} + /> + ); +} diff --git a/llama_stack/ui/components/responses/items/generic-item.tsx b/llama_stack/ui/components/responses/items/generic-item.tsx new file mode 100644 index 000000000..6b6f56603 --- /dev/null +++ b/llama_stack/ui/components/responses/items/generic-item.tsx @@ -0,0 +1,37 @@ +import { + MessageBlock, + ToolCallBlock, +} from "@/components/ui/message-components"; +import { BaseItem } from "../utils/item-types"; + +interface GenericItemProps { + item: BaseItem; + index: number; + keyPrefix: string; +} + +/** Fallback display for an item: a string `content` is shown verbatim, any other truthy `content` is pretty-printed as JSON, and when `content` is falsy the whole item is pretty-printed instead. The label is "Input" when `keyPrefix` is "input" and "Output" otherwise. */ export function GenericItemComponent({ + item, + index, + keyPrefix, +}: GenericItemProps) { + // Handle other types like function calls, tool outputs, etc. + const itemData = item as Record; + + const content = itemData.content + ? typeof itemData.content === "string" + ? itemData.content + : JSON.stringify(itemData.content, null, 2) + : JSON.stringify(itemData, null, 2); + + const label = keyPrefix === "input" ? 
"Input" : "Output"; + + return ( + {content}} + /> + ); +} diff --git a/llama_stack/ui/components/responses/items/grouped-function-call-item.tsx b/llama_stack/ui/components/responses/items/grouped-function-call-item.tsx new file mode 100644 index 000000000..ded0ced71 --- /dev/null +++ b/llama_stack/ui/components/responses/items/grouped-function-call-item.tsx @@ -0,0 +1,54 @@ +import { + MessageBlock, + ToolCallBlock, +} from "@/components/ui/message-components"; +import { FunctionCallItem, FunctionCallOutputItem } from "../utils/item-types"; + +interface GroupedFunctionCallItemProps { + functionCall: FunctionCallItem; + output: FunctionCallOutputItem; + index: number; + keyPrefix: string; +} + +/** Displays a function call together with its output under "Arguments" and "Output" headings. The call is shown as `name(arguments)` with "unknown" / "{}" fallbacks; the output is shown verbatim when it is a string and as JSON otherwise, and when `output.output` is falsy the whole output item is pretty-printed instead. */ export function GroupedFunctionCallItemComponent({ + functionCall, + output, + index, + keyPrefix, +}: GroupedFunctionCallItemProps) { + const name = functionCall.name || "unknown"; + const args = functionCall.arguments || "{}"; + + // Extract the output content from function_call_output + let outputContent = ""; + if (output.output) { + outputContent = + typeof output.output === "string" + ? output.output + : JSON.stringify(output.output); + } else { + outputContent = JSON.stringify(output, null, 2); + } + + const functionCallContent = ( +
    +
    + Arguments + {`${name}(${args})`} +
    +
    + Output + {outputContent} +
    +
    + ); + + return ( + + ); +} diff --git a/llama_stack/ui/components/responses/items/index.ts b/llama_stack/ui/components/responses/items/index.ts new file mode 100644 index 000000000..d7bcc2ea4 --- /dev/null +++ b/llama_stack/ui/components/responses/items/index.ts @@ -0,0 +1,6 @@ +/* Re-exports the item components defined in this directory. */ export { MessageItemComponent } from "./message-item"; +export { FunctionCallItemComponent } from "./function-call-item"; +export { WebSearchItemComponent } from "./web-search-item"; +export { GenericItemComponent } from "./generic-item"; +export { GroupedFunctionCallItemComponent } from "./grouped-function-call-item"; +export { ItemRenderer } from "./item-renderer"; diff --git a/llama_stack/ui/components/responses/items/item-renderer.tsx b/llama_stack/ui/components/responses/items/item-renderer.tsx new file mode 100644 index 000000000..8f65d50c4 --- /dev/null +++ b/llama_stack/ui/components/responses/items/item-renderer.tsx @@ -0,0 +1,60 @@ +import { + isMessageItem, + isFunctionCallItem, + isWebSearchCallItem, + AnyResponseItem, +} from "../utils/item-types"; +import { MessageItemComponent } from "./message-item"; +import { FunctionCallItemComponent } from "./function-call-item"; +import { WebSearchItemComponent } from "./web-search-item"; +import { GenericItemComponent } from "./generic-item"; + +interface ItemRendererProps { + item: AnyResponseItem; + index: number; + keyPrefix: string; + defaultRole?: string; +} + +/** Chooses how to render a response item by testing the type guards in order: message, function call, web search call. Anything that matches none of them falls through to the generic branch. `defaultRole` defaults to "unknown". */ export function ItemRenderer({ + item, + index, + keyPrefix, + defaultRole = "unknown", +}: ItemRendererProps) { + if (isMessageItem(item)) { + return ( + + ); + } + + if (isFunctionCallItem(item)) { + return ( + + ); + } + + if (isWebSearchCallItem(item)) { + return ( + + ); + } + + // Fallback to generic item for unknown types + return ( + + ); +} diff --git a/llama_stack/ui/components/responses/items/message-item.tsx b/llama_stack/ui/components/responses/items/message-item.tsx new file mode 100644 index 000000000..532fddfaa --- /dev/null +++ 
b/llama_stack/ui/components/responses/items/message-item.tsx @@ -0,0 +1,41 @@ +import { MessageBlock } from "@/components/ui/message-components"; +import { MessageItem } from "../utils/item-types"; + +interface MessageItemProps { + item: MessageItem; + index: number; + keyPrefix: string; + defaultRole?: string; +} + +/** Displays a message item. String content is used as-is; array content is flattened by taking `text` from "input_text" / "output_text" parts, JSON-stringifying any other part, and joining the pieces with a single space; any other content shape leaves the text empty. The label is the role (or `defaultRole`, which defaults to "unknown") with its first character upper-cased. */ export function MessageItemComponent({ + item, + index, + keyPrefix, + defaultRole = "unknown", +}: MessageItemProps) { + let content = ""; + + if (typeof item.content === "string") { + content = item.content; + } else if (Array.isArray(item.content)) { + content = item.content + .map((c) => { + return c.type === "input_text" || c.type === "output_text" + ? c.text + : JSON.stringify(c); + }) + .join(" "); + } + + const role = item.role || defaultRole; + const label = role.charAt(0).toUpperCase() + role.slice(1); + + return ( + + ); +} diff --git a/llama_stack/ui/components/responses/items/web-search-item.tsx b/llama_stack/ui/components/responses/items/web-search-item.tsx new file mode 100644 index 000000000..aaa5741ce --- /dev/null +++ b/llama_stack/ui/components/responses/items/web-search-item.tsx @@ -0,0 +1,28 @@ +import { + MessageBlock, + ToolCallBlock, +} from "@/components/ui/message-components"; +import { WebSearchCallItem } from "../utils/item-types"; + +interface WebSearchItemProps { + item: WebSearchCallItem; + index: number; + keyPrefix: string; +} + +/** Displays a web search call as the text `web_search_call(status: <item.status>)`. */ export function WebSearchItemComponent({ + item, + index, + keyPrefix, +}: WebSearchItemProps) { + const formattedWebSearch = `web_search_call(status: ${item.status})`; + + return ( + {formattedWebSearch}} + /> + ); +} diff --git a/llama_stack/ui/components/responses/responses-detail.test.tsx b/llama_stack/ui/components/responses/responses-detail.test.tsx new file mode 100644 index 000000000..f426dc059 --- /dev/null +++ b/llama_stack/ui/components/responses/responses-detail.test.tsx @@ -0,0 +1,777 @@ +import React from "react"; +import { render, screen } from "@testing-library/react"; +import 
"@testing-library/jest-dom"; +import { ResponseDetailView } from "./responses-detail"; +import { OpenAIResponse, InputItemListResponse } from "@/lib/types"; + +describe("ResponseDetailView", () => { + const defaultProps = { + response: null, + inputItems: null, + isLoading: false, + isLoadingInputItems: false, + error: null, + inputItemsError: null, + id: "test_id", + }; + + describe("Loading State", () => { + test("renders loading skeleton when isLoading is true", () => { + const { container } = render( + , + ); + + // Check for skeleton elements + const skeletons = container.querySelectorAll('[data-slot="skeleton"]'); + expect(skeletons.length).toBeGreaterThan(0); + + // The title is replaced by a skeleton when loading, so we shouldn't expect the text + }); + }); + + describe("Error State", () => { + test("renders error message when error prop is provided", () => { + const errorMessage = "Network Error"; + render( + , + ); + + expect(screen.getByText("Responses Details")).toBeInTheDocument(); + // The error message is split across elements, so we check for parts + expect( + screen.getByText(/Error loading details for ID/), + ).toBeInTheDocument(); + expect(screen.getByText(/test_id/)).toBeInTheDocument(); + expect(screen.getByText(/Network Error/)).toBeInTheDocument(); + }); + + test("renders default error message when error.message is not available", () => { + render( + , + ); + + expect( + screen.getByText(/Error loading details for ID/), + ).toBeInTheDocument(); + expect(screen.getByText(/test_id/)).toBeInTheDocument(); + }); + }); + + describe("Not Found State", () => { + test("renders not found message when response is null and not loading/error", () => { + render(); + + expect(screen.getByText("Responses Details")).toBeInTheDocument(); + // The message is split across elements + expect(screen.getByText(/No details found for ID:/)).toBeInTheDocument(); + expect(screen.getByText(/test_id/)).toBeInTheDocument(); + }); + }); + + describe("Response Data 
Rendering", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "llama-test-model", + status: "completed", + output: [ + { + type: "message", + role: "assistant", + content: "Test response output", + }, + ], + input: [ + { + type: "message", + role: "user", + content: "Test input message", + }, + ], + temperature: 0.7, + top_p: 0.9, + parallel_tool_calls: true, + previous_response_id: "prev_resp_456", + }; + + test("renders response data with input and output sections", () => { + render(); + + // Check main sections + expect(screen.getByText("Responses Details")).toBeInTheDocument(); + expect(screen.getByText("Input")).toBeInTheDocument(); + expect(screen.getByText("Output")).toBeInTheDocument(); + + // Check input content + expect(screen.getByText("Test input message")).toBeInTheDocument(); + expect(screen.getByText("User")).toBeInTheDocument(); + + // Check output content + expect(screen.getByText("Test response output")).toBeInTheDocument(); + expect(screen.getByText("Assistant")).toBeInTheDocument(); + }); + + test("renders properties sidebar with all response metadata", () => { + render(); + + // Check properties - use regex to handle text split across elements + expect(screen.getByText(/Created/)).toBeInTheDocument(); + expect( + screen.getByText(new Date(1710000000 * 1000).toLocaleString()), + ).toBeInTheDocument(); + + // Check for the specific ID label (not Previous Response ID) + expect( + screen.getByText((content, element) => { + return element?.tagName === "STRONG" && content === "ID:"; + }), + ).toBeInTheDocument(); + expect(screen.getByText("resp_123")).toBeInTheDocument(); + + expect(screen.getByText(/Model/)).toBeInTheDocument(); + expect(screen.getByText("llama-test-model")).toBeInTheDocument(); + + expect(screen.getByText(/Status/)).toBeInTheDocument(); + expect(screen.getByText("completed")).toBeInTheDocument(); + + 
expect(screen.getByText(/Temperature/)).toBeInTheDocument(); + expect(screen.getByText("0.7")).toBeInTheDocument(); + + expect(screen.getByText(/Top P/)).toBeInTheDocument(); + expect(screen.getByText("0.9")).toBeInTheDocument(); + + expect(screen.getByText(/Parallel Tool Calls/)).toBeInTheDocument(); + expect(screen.getByText("Yes")).toBeInTheDocument(); + + expect(screen.getByText(/Previous Response ID/)).toBeInTheDocument(); + expect(screen.getByText("prev_resp_456")).toBeInTheDocument(); + }); + + test("handles optional properties correctly", () => { + const minimalResponse: OpenAIResponse = { + id: "resp_minimal", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [], + }; + + render( + , + ); + + // Should show required properties + expect(screen.getByText("resp_minimal")).toBeInTheDocument(); + expect(screen.getByText("test-model")).toBeInTheDocument(); + expect(screen.getByText("completed")).toBeInTheDocument(); + + // Should not show optional properties + expect(screen.queryByText("Temperature")).not.toBeInTheDocument(); + expect(screen.queryByText("Top P")).not.toBeInTheDocument(); + expect(screen.queryByText("Parallel Tool Calls")).not.toBeInTheDocument(); + expect( + screen.queryByText("Previous Response ID"), + ).not.toBeInTheDocument(); + }); + + test("renders error information when response has error", () => { + const errorResponse: OpenAIResponse = { + ...mockResponse, + error: { + code: "invalid_request", + message: "The request was invalid", + }, + }; + + render(); + + // The error is shown in the properties sidebar, not as a separate "Error" label + expect( + screen.getByText("invalid_request: The request was invalid"), + ).toBeInTheDocument(); + }); + }); + + describe("Input Items Handling", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [{ type: "message", 
role: "assistant", content: "output" }], + input: [{ type: "message", role: "user", content: "fallback input" }], + }; + + test("shows loading state for input items", () => { + render( + , + ); + + // Check for skeleton loading in input items section + const { container } = render( + , + ); + + const skeletons = container.querySelectorAll('[data-slot="skeleton"]'); + expect(skeletons.length).toBeGreaterThan(0); + }); + + test("shows error message for input items with fallback", () => { + render( + , + ); + + expect( + screen.getByText( + "Error loading input items: Failed to load input items", + ), + ).toBeInTheDocument(); + expect( + screen.getByText("Falling back to response input data."), + ).toBeInTheDocument(); + + // Should still show fallback input data + expect(screen.getByText("fallback input")).toBeInTheDocument(); + }); + + test("uses input items data when available", () => { + const mockInputItems: InputItemListResponse = { + object: "list", + data: [ + { + type: "message", + role: "user", + content: "input from items API", + }, + ], + }; + + render( + , + ); + + // Should show input items data, not response.input + expect(screen.getByText("input from items API")).toBeInTheDocument(); + expect(screen.queryByText("fallback input")).not.toBeInTheDocument(); + }); + + test("falls back to response.input when input items is empty", () => { + const emptyInputItems: InputItemListResponse = { + object: "list", + data: [], + }; + + render( + , + ); + + // Should show fallback input data + expect(screen.getByText("fallback input")).toBeInTheDocument(); + }); + + test("shows no input message when no data available", () => { + const responseWithoutInput: OpenAIResponse = { + ...mockResponse, + input: [], + }; + + render( + , + ); + + expect(screen.getByText("No input data available.")).toBeInTheDocument(); + }); + }); + + describe("Input Display Components", () => { + test("renders string content input correctly", () => { + const mockResponse: OpenAIResponse = { + 
id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [ + { + type: "message", + role: "user", + content: "Simple string input", + }, + ], + }; + + render(); + + expect(screen.getByText("Simple string input")).toBeInTheDocument(); + expect(screen.getByText("User")).toBeInTheDocument(); + }); + + test("renders array content input correctly", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [ + { + type: "message", + role: "user", + content: [ + { type: "input_text", text: "First part" }, + { type: "output_text", text: "Second part" }, + ], + }, + ], + }; + + render(); + + expect(screen.getByText("First part Second part")).toBeInTheDocument(); + expect(screen.getByText("User")).toBeInTheDocument(); + }); + + test("renders non-message input types correctly", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [ + { + type: "function_call", + content: "function call content", + }, + ], + }; + + render(); + + expect(screen.getByText("function call content")).toBeInTheDocument(); + // Use getAllByText to find the specific "Input" with the type detail + const inputElements = screen.getAllByText("Input"); + expect(inputElements.length).toBeGreaterThan(0); + expect(screen.getByText("(function_call)")).toBeInTheDocument(); + }); + + test("handles input with object content", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [ + { + type: "custom_type", + content: JSON.stringify({ key: "value", nested: { data: "test" } }), + }, + ], + }; + + render(); + + // Should show JSON stringified content (without 
quotes around keys in the rendered output) + expect(screen.getByText(/key.*value/)).toBeInTheDocument(); + // Use getAllByText to find the specific "Input" with the type detail + const inputElements = screen.getAllByText("Input"); + expect(inputElements.length).toBeGreaterThan(0); + expect(screen.getByText("(custom_type)")).toBeInTheDocument(); + }); + + test("renders function call input correctly", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [ + { + type: "function_call", + id: "call_456", + status: "completed", + name: "input_function", + arguments: '{"param": "value"}', + }, + ], + }; + + render(); + + expect( + screen.getByText('input_function({"param": "value"})'), + ).toBeInTheDocument(); + expect(screen.getByText("Function Call")).toBeInTheDocument(); + }); + + test("renders web search call input correctly", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [ + { + type: "web_search_call", + id: "search_789", + status: "completed", + }, + ], + }; + + render(); + + expect( + screen.getByText("web_search_call(status: completed)"), + ).toBeInTheDocument(); + expect(screen.getByText("Function Call")).toBeInTheDocument(); + expect(screen.getByText("(Web Search)")).toBeInTheDocument(); + }); + }); + + describe("Output Display Components", () => { + test("renders message output with string content", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "message", + role: "assistant", + content: "Simple string output", + }, + ], + input: [], + }; + + render(); + + expect(screen.getByText("Simple string output")).toBeInTheDocument(); + 
expect(screen.getByText("Assistant")).toBeInTheDocument(); + }); + + test("renders message output with array content", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "message", + role: "assistant", + content: [ + { type: "output_text", text: "First output" }, + { type: "input_text", text: "Second output" }, + ], + }, + ], + input: [], + }; + + render(); + + expect( + screen.getByText("First output Second output"), + ).toBeInTheDocument(); + expect(screen.getByText("Assistant")).toBeInTheDocument(); + }); + + test("renders function call output correctly", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "function_call", + id: "call_123", + status: "completed", + name: "search_function", + arguments: '{"query": "test"}', + }, + ], + input: [], + }; + + render(); + + expect( + screen.getByText('search_function({"query": "test"})'), + ).toBeInTheDocument(); + expect(screen.getByText("Function Call")).toBeInTheDocument(); + }); + + test("renders function call output without arguments", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "function_call", + id: "call_123", + status: "completed", + name: "simple_function", + }, + ], + input: [], + }; + + render(); + + expect(screen.getByText("simple_function({})")).toBeInTheDocument(); + expect(screen.getByText(/Function Call/)).toBeInTheDocument(); + }); + + test("renders web search call output correctly", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "web_search_call", + id: 
"search_123", + status: "completed", + }, + ], + input: [], + }; + + render(); + + expect( + screen.getByText("web_search_call(status: completed)"), + ).toBeInTheDocument(); + expect(screen.getByText(/Function Call/)).toBeInTheDocument(); + expect(screen.getByText("(Web Search)")).toBeInTheDocument(); + }); + + test("renders unknown output types with JSON fallback", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "unknown_type", + custom_field: "custom_value", + data: { nested: "object" }, + } as any, + ], + input: [], + }; + + render(); + + // Should show JSON stringified content + expect( + screen.getByText(/custom_field.*custom_value/), + ).toBeInTheDocument(); + expect(screen.getByText("(unknown_type)")).toBeInTheDocument(); + }); + + test("shows no output message when output array is empty", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [], + }; + + render(); + + expect(screen.getByText("No output data available.")).toBeInTheDocument(); + }); + + test("groups function call with its output correctly", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "function_call", + id: "call_123", + status: "completed", + name: "get_weather", + arguments: '{"city": "Tokyo"}', + }, + { + type: "message", + role: "assistant", + call_id: "call_123", + content: "sunny and warm", + } as any, // Using any to bypass the type restriction for this test + ], + input: [], + }; + + render(); + + // Should show the function call and message as separate items (not grouped) + expect(screen.getByText("Function Call")).toBeInTheDocument(); + expect( + 
screen.getByText('get_weather({"city": "Tokyo"})'), + ).toBeInTheDocument(); + expect(screen.getByText("Assistant")).toBeInTheDocument(); + expect(screen.getByText("sunny and warm")).toBeInTheDocument(); + + // Should NOT have the grouped "Arguments" and "Output" labels + expect(screen.queryByText("Arguments")).not.toBeInTheDocument(); + }); + + test("groups function call with function_call_output correctly", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "function_call", + call_id: "call_123", + status: "completed", + name: "get_weather", + arguments: '{"city": "Tokyo"}', + }, + { + type: "function_call_output", + id: "fc_68364957013081...", + status: "completed", + call_id: "call_123", + output: "sunny and warm", + } as any, // Using any to bypass the type restriction for this test + ], + input: [], + }; + + render(); + + // Should show the function call grouped with its clean output + expect(screen.getByText("Function Call")).toBeInTheDocument(); + expect(screen.getByText("Arguments")).toBeInTheDocument(); + expect( + screen.getByText('get_weather({"city": "Tokyo"})'), + ).toBeInTheDocument(); + // Use getAllByText since there are multiple "Output" elements (card title and output label) + const outputElements = screen.getAllByText("Output"); + expect(outputElements.length).toBeGreaterThan(0); + expect(screen.getByText("sunny and warm")).toBeInTheDocument(); + }); + }); + + describe("Edge Cases and Error Handling", () => { + test("handles missing role in message input", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [], + input: [ + { + type: "message", + content: "Message without role", + }, + ], + }; + + render(); + + expect(screen.getByText("Message without role")).toBeInTheDocument(); + 
expect(screen.getByText("Unknown")).toBeInTheDocument(); // Default role + }); + + test("handles missing name in function call output", () => { + const mockResponse: OpenAIResponse = { + id: "resp_123", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "function_call", + id: "call_123", + status: "completed", + }, + ], + input: [], + }; + + render(); + + // When name is missing, it falls back to JSON.stringify of the entire output + const functionCallElements = screen.getAllByText(/function_call/); + expect(functionCallElements.length).toBeGreaterThan(0); + expect(screen.getByText(/call_123/)).toBeInTheDocument(); + }); + }); +}); diff --git a/llama_stack/ui/components/responses/responses-detail.tsx b/llama_stack/ui/components/responses/responses-detail.tsx new file mode 100644 index 000000000..c8c447ba4 --- /dev/null +++ b/llama_stack/ui/components/responses/responses-detail.tsx @@ -0,0 +1,171 @@ +"use client"; + +import { OpenAIResponse, InputItemListResponse } from "@/lib/types"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Skeleton } from "@/components/ui/skeleton"; +import { + DetailLoadingView, + DetailErrorView, + DetailNotFoundView, + DetailLayout, + PropertiesCard, + PropertyItem, +} from "@/components/layout/detail-layout"; +import { GroupedItemsDisplay } from "./grouping/grouped-items-display"; + +interface ResponseDetailViewProps { + response: OpenAIResponse | null; + inputItems: InputItemListResponse | null; + isLoading: boolean; + isLoadingInputItems: boolean; + error: Error | null; + inputItemsError: Error | null; + id: string; +} + +export function ResponseDetailView({ + response, + inputItems, + isLoading, + isLoadingInputItems, + error, + inputItemsError, + id, +}: ResponseDetailViewProps) { + const title = "Responses Details"; + + if (error) { + return ; + } + + if (isLoading) { + return ; + } + + if (!response) { + return ; + } + + 
// Main content cards + const mainContent = ( + <> + + + Input + + + {/* Show loading state for input items */} + {isLoadingInputItems ? ( +
    + + + +
    + ) : inputItemsError ? ( +
    + Error loading input items: {inputItemsError.message} +
    + + Falling back to response input data. + +
    + ) : null} + + {/* Display input items if available, otherwise fall back to response.input */} + {(() => { + const dataToDisplay = + inputItems?.data && inputItems.data.length > 0 + ? inputItems.data + : response.input; + + if (dataToDisplay && dataToDisplay.length > 0) { + return ( + + ); + } else { + return ( +

    + No input data available. +

    + ); + } + })()} +
    +
    + + + + Output + + + {response.output?.length > 0 ? ( + + ) : ( +

    + No output data available. +

    + )} +
    +
    + + ); + + // Properties sidebar + const sidebar = ( + + + + + + {response.temperature && ( + + )} + {response.top_p && } + {response.parallel_tool_calls && ( + + )} + {response.previous_response_id && ( + {response.previous_response_id} + } + hasBorder + /> + )} + {response.error && ( + + {response.error.code}: {response.error.message} + + } + className="pt-1 mt-1 border-t border-red-200" + /> + )} + + ); + + return ( + + ); +} diff --git a/llama_stack/ui/components/responses/responses-table.test.tsx b/llama_stack/ui/components/responses/responses-table.test.tsx new file mode 100644 index 000000000..7c45c57d3 --- /dev/null +++ b/llama_stack/ui/components/responses/responses-table.test.tsx @@ -0,0 +1,537 @@ +import React from "react"; +import { render, screen, fireEvent } from "@testing-library/react"; +import "@testing-library/jest-dom"; +import { ResponsesTable } from "./responses-table"; +import { OpenAIResponse } from "@/lib/types"; + +// Mock next/navigation +const mockPush = jest.fn(); +jest.mock("next/navigation", () => ({ + useRouter: () => ({ + push: mockPush, + }), +})); + +// Mock helper functions +jest.mock("@/lib/truncate-text"); + +// Import the mocked functions +import { truncateText as originalTruncateText } from "@/lib/truncate-text"; + +// Cast to jest.Mock for typings +const truncateText = originalTruncateText as jest.Mock; + +describe("ResponsesTable", () => { + const defaultProps = { + data: [] as OpenAIResponse[], + isLoading: false, + error: null, + }; + + beforeEach(() => { + // Reset all mocks before each test + mockPush.mockClear(); + truncateText.mockClear(); + + // Default pass-through implementation + truncateText.mockImplementation((text: string | undefined) => text); + }); + + test("renders without crashing with default props", () => { + render(); + expect(screen.getByText("No responses found.")).toBeInTheDocument(); + }); + + test("click on a row navigates to the correct URL", () => { + const mockResponse: OpenAIResponse = { + 
id: "resp_123", + object: "response", + created_at: Math.floor(Date.now() / 1000), + model: "llama-test-model", + status: "completed", + output: [ + { + type: "message", + role: "assistant", + content: "Test output", + }, + ], + input: [ + { + type: "message", + role: "user", + content: "Test input", + }, + ], + }; + + render(); + + const row = screen.getByText("Test input").closest("tr"); + if (row) { + fireEvent.click(row); + expect(mockPush).toHaveBeenCalledWith("/logs/responses/resp_123"); + } else { + throw new Error('Row with "Test input" not found for router mock test.'); + } + }); + + describe("Loading State", () => { + test("renders skeleton UI when isLoading is true", () => { + const { container } = render( + , + ); + + // Check for skeleton in the table caption + const tableCaption = container.querySelector("caption"); + expect(tableCaption).toBeInTheDocument(); + if (tableCaption) { + const captionSkeleton = tableCaption.querySelector( + '[data-slot="skeleton"]', + ); + expect(captionSkeleton).toBeInTheDocument(); + } + + // Check for skeletons in the table body cells + const tableBody = container.querySelector("tbody"); + expect(tableBody).toBeInTheDocument(); + if (tableBody) { + const bodySkeletons = tableBody.querySelectorAll( + '[data-slot="skeleton"]', + ); + expect(bodySkeletons.length).toBeGreaterThan(0); + } + }); + }); + + describe("Error State", () => { + test("renders error message when error prop is provided", () => { + const errorMessage = "Network Error"; + render( + , + ); + expect( + screen.getByText(`Error fetching data: ${errorMessage}`), + ).toBeInTheDocument(); + }); + + test("renders default error message when error.message is not available", () => { + render( + , + ); + expect( + screen.getByText("Error fetching data: An unknown error occurred"), + ).toBeInTheDocument(); + }); + + test("renders default error message when error prop is an object without message", () => { + render(); + expect( + screen.getByText("Error fetching 
data: An unknown error occurred"), + ).toBeInTheDocument(); + }); + }); + + describe("Empty State", () => { + test('renders "No responses found." and no table when data array is empty', () => { + render(); + expect(screen.getByText("No responses found.")).toBeInTheDocument(); + + // Ensure that the table structure is NOT rendered in the empty state + const table = screen.queryByRole("table"); + expect(table).not.toBeInTheDocument(); + }); + }); + + describe("Data Rendering", () => { + test("renders table caption, headers, and response data correctly", () => { + const mockResponses = [ + { + id: "resp_1", + object: "response" as const, + created_at: 1710000000, + model: "llama-test-model", + status: "completed", + output: [ + { + type: "message" as const, + role: "assistant" as const, + content: "Test output", + }, + ], + input: [ + { + type: "message", + role: "user", + content: "Test input", + }, + ], + }, + { + id: "resp_2", + object: "response" as const, + created_at: 1710001000, + model: "llama-another-model", + status: "completed", + output: [ + { + type: "message" as const, + role: "assistant" as const, + content: "Another output", + }, + ], + input: [ + { + type: "message", + role: "user", + content: "Another input", + }, + ], + }, + ]; + + render( + , + ); + + // Table caption + expect( + screen.getByText("A list of your recent responses."), + ).toBeInTheDocument(); + + // Table headers + expect(screen.getByText("Input")).toBeInTheDocument(); + expect(screen.getByText("Output")).toBeInTheDocument(); + expect(screen.getByText("Model")).toBeInTheDocument(); + expect(screen.getByText("Created")).toBeInTheDocument(); + + // Data rows + expect(screen.getByText("Test input")).toBeInTheDocument(); + expect(screen.getByText("Test output")).toBeInTheDocument(); + expect(screen.getByText("llama-test-model")).toBeInTheDocument(); + expect( + screen.getByText(new Date(1710000000 * 1000).toLocaleString()), + ).toBeInTheDocument(); + + expect(screen.getByText("Another 
input")).toBeInTheDocument(); + expect(screen.getByText("Another output")).toBeInTheDocument(); + expect(screen.getByText("llama-another-model")).toBeInTheDocument(); + expect( + screen.getByText(new Date(1710001000 * 1000).toLocaleString()), + ).toBeInTheDocument(); + }); + }); + + describe("Input Text Extraction", () => { + test("extracts text from string content", () => { + const mockResponse: OpenAIResponse = { + id: "resp_string", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [{ type: "message", role: "assistant", content: "output" }], + input: [ + { + type: "message", + role: "user", + content: "Simple string input", + }, + ], + }; + + render( + , + ); + expect(screen.getByText("Simple string input")).toBeInTheDocument(); + }); + + test("extracts text from array content with input_text type", () => { + const mockResponse: OpenAIResponse = { + id: "resp_array", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [{ type: "message", role: "assistant", content: "output" }], + input: [ + { + type: "message", + role: "user", + content: [ + { type: "input_text", text: "Array input text" }, + { type: "input_text", text: "Should not be used" }, + ], + }, + ], + }; + + render( + , + ); + expect(screen.getByText("Array input text")).toBeInTheDocument(); + }); + + test("returns empty string when no message input found", () => { + const mockResponse: OpenAIResponse = { + id: "resp_no_input", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [{ type: "message", role: "assistant", content: "output" }], + input: [ + { + type: "other_type", + content: "Not a message", + }, + ], + }; + + const { container } = render( + , + ); + + // Find the input cell (first cell in the data row) and verify it's empty + const inputCell = container.querySelector("tbody tr td:first-child"); + expect(inputCell).toBeInTheDocument(); + 
expect(inputCell).toHaveTextContent(""); + }); + }); + + describe("Output Text Extraction", () => { + test("extracts text from string message content", () => { + const mockResponse: OpenAIResponse = { + id: "resp_string_output", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "message", + role: "assistant", + content: "Simple string output", + }, + ], + input: [{ type: "message", content: "input" }], + }; + + render( + , + ); + expect(screen.getByText("Simple string output")).toBeInTheDocument(); + }); + + test("extracts text from array message content with output_text type", () => { + const mockResponse: OpenAIResponse = { + id: "resp_array_output", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "message", + role: "assistant", + content: [ + { type: "output_text", text: "Array output text" }, + { type: "output_text", text: "Should not be used" }, + ], + }, + ], + input: [{ type: "message", content: "input" }], + }; + + render( + , + ); + expect(screen.getByText("Array output text")).toBeInTheDocument(); + }); + + test("formats function call output", () => { + const mockResponse: OpenAIResponse = { + id: "resp_function_call", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "function_call", + id: "call_123", + status: "completed", + name: "search_function", + arguments: '{"query": "test"}', + }, + ], + input: [{ type: "message", content: "input" }], + }; + + render( + , + ); + expect( + screen.getByText('search_function({"query": "test"})'), + ).toBeInTheDocument(); + }); + + test("formats function call output without arguments", () => { + const mockResponse: OpenAIResponse = { + id: "resp_function_no_args", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "function_call", + id: "call_123", 
+ status: "completed", + name: "simple_function", + }, + ], + input: [{ type: "message", content: "input" }], + }; + + render( + , + ); + expect(screen.getByText("simple_function({})")).toBeInTheDocument(); + }); + + test("formats web search call output", () => { + const mockResponse: OpenAIResponse = { + id: "resp_web_search", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "web_search_call", + id: "search_123", + status: "completed", + }, + ], + input: [{ type: "message", content: "input" }], + }; + + render( + , + ); + expect( + screen.getByText("web_search_call(status: completed)"), + ).toBeInTheDocument(); + }); + + test("falls back to JSON.stringify for unknown tool call types", () => { + const mockResponse: OpenAIResponse = { + id: "resp_unknown_tool", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "unknown_call", + id: "unknown_123", + status: "completed", + custom_field: "custom_value", + } as any, + ], + input: [{ type: "message", content: "input" }], + }; + + render( + , + ); + // Should contain the JSON stringified version + expect(screen.getByText(/unknown_call/)).toBeInTheDocument(); + }); + + test("falls back to JSON.stringify for entire output when no message or tool call found", () => { + const mockResponse: OpenAIResponse = { + id: "resp_fallback", + object: "response", + created_at: 1710000000, + model: "test-model", + status: "completed", + output: [ + { + type: "unknown_type", + data: "some data", + } as any, + ], + input: [{ type: "message", content: "input" }], + }; + + render( + , + ); + // Should contain the JSON stringified version of the output array + expect(screen.getByText(/unknown_type/)).toBeInTheDocument(); + }); + }); + + describe("Text Truncation", () => { + test("truncates long input and output text", () => { + // Specific mock implementation for this test + truncateText.mockImplementation( 
+ (text: string | undefined, maxLength?: number) => { + const defaultTestMaxLength = 10; + const effectiveMaxLength = maxLength ?? defaultTestMaxLength; + return typeof text === "string" && text.length > effectiveMaxLength + ? text.slice(0, effectiveMaxLength) + "..." + : text; + }, + ); + + const longInput = + "This is a very long input message that should be truncated."; + const longOutput = + "This is a very long output message that should also be truncated."; + + const mockResponse: OpenAIResponse = { + id: "resp_trunc", + object: "response", + created_at: 1710002000, + model: "llama-trunc-model", + status: "completed", + output: [ + { + type: "message", + role: "assistant", + content: longOutput, + }, + ], + input: [ + { + type: "message", + role: "user", + content: longInput, + }, + ], + }; + + render( + , + ); + + // The truncated text should be present for both input and output + const truncatedTexts = screen.getAllByText( + longInput.slice(0, 10) + "...", + ); + expect(truncatedTexts.length).toBe(2); // one for input, one for output + truncatedTexts.forEach((textElement) => + expect(textElement).toBeInTheDocument(), + ); + }); + }); +}); diff --git a/llama_stack/ui/components/responses/responses-table.tsx b/llama_stack/ui/components/responses/responses-table.tsx new file mode 100644 index 000000000..352450d18 --- /dev/null +++ b/llama_stack/ui/components/responses/responses-table.tsx @@ -0,0 +1,117 @@ +"use client"; + +import { + OpenAIResponse, + ResponseInput, + ResponseInputMessageContent, +} from "@/lib/types"; +import { LogsTable, LogTableRow } from "@/components/logs/logs-table"; +import { + isMessageInput, + isMessageItem, + isFunctionCallItem, + isWebSearchCallItem, + MessageItem, + FunctionCallItem, + WebSearchCallItem, +} from "./utils/item-types"; + +interface ResponsesTableProps { + data: OpenAIResponse[]; + isLoading: boolean; + error: Error | null; +} + +function getInputText(response: OpenAIResponse): string { + const firstInput = 
/**
 * Picks the text shown in the "output" column for a response.
 *
 * Preference order:
 *   1. the text of the first message item, when that text is non-empty;
 *   2. the first function call, formatted as `name(arguments)`;
 *   3. the first web search call, formatted with its status;
 *   4. the JSON serialization of the whole output array.
 */
function getOutputText(response: OpenAIResponse): string {
  const { output } = response;

  // NOTE(review): the `as any` casts are carried over unchanged; presumably
  // the output element type is not directly accepted by the guards — confirm
  // against the declarations in "@/lib/types" before removing them.
  const message = output.find((candidate) => isMessageItem(candidate as any));
  const messageText = message
    ? extractContentFromItem(message as MessageItem)
    : "";
  if (messageText) {
    return messageText;
  }

  const call = output.find((candidate) =>
    isFunctionCallItem(candidate as any),
  );
  if (call) {
    return formatFunctionCall(call as FunctionCallItem);
  }

  const search = output.find((candidate) =>
    isWebSearchCallItem(candidate as any),
  );
  if (search) {
    return formatWebSearchCall(search as WebSearchCallItem);
  }

  // Nothing recognizable: fall back to the raw output array.
  return JSON.stringify(output);
}

/**
 * Extracts display text from an item's `content`.
 *
 * - string content is returned as-is;
 * - array content yields the `text` of the first part whose type is
 *   "input_text" or "output_text" (later parts are ignored);
 * - anything else (missing content, no matching part, empty text) yields "".
 */
function extractContentFromItem(item: {
  content?: string | ResponseInputMessageContent[];
}): string {
  const { content } = item;

  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }

  const firstTextPart = content.find(
    (part: ResponseInputMessageContent) =>
      part.type === "input_text" || part.type === "output_text",
  );
  return firstTextPart?.text || "";
}

/**
 * Formats a function call as `name(arguments)`, substituting "unknown" for a
 * missing name and "{}" for missing arguments.
 */
function formatFunctionCall(functionCall: FunctionCallItem): string {
  const { name, arguments: rawArgs } = functionCall;
  return `${name || "unknown"}(${rawArgs || "{}"})`;
}

/** Formats a web search call as `web_search_call(status: <status>)`. */
function formatWebSearchCall(webSearchCall: WebSearchCallItem): string {
  const { status } = webSearchCall;
  return `web_search_call(status: ${status})`;
}

/**
 * Converts a response into a logs-table row: input/output text, model,
 * a locale-formatted creation time (`created_at` is multiplied by 1000
 * before being handed to `Date`), and the detail page path.
 */
function formatResponseToRow(response: OpenAIResponse): LogTableRow {
  const { id, model, created_at: createdAtSeconds } = response;
  const createdTime = new Date(createdAtSeconds * 1000).toLocaleString();

  return {
    id,
    input: getInputText(response),
    output: getOutputText(response),
    model,
    createdTime,
    detailPath: `/logs/responses/${id}`,
  };
}
/**
 * Type guards for different item types in responses
 */

import type {
  ResponseInput,
  ResponseOutput,
  ResponseMessage,
  ResponseToolCall,
} from "@/lib/types";

/** Minimal shape shared by items: a `type` tag plus arbitrary extra fields. */
export interface BaseItem {
  type: string;
  [key: string]: unknown;
}

/** A message item (alias of `ResponseMessage`). */
export type MessageItem = ResponseMessage;
/** A tool call whose `type` is "function_call". */
export type FunctionCallItem = ResponseToolCall & { type: "function_call" };
/** A tool call whose `type` is "web_search_call". */
export type WebSearchCallItem = ResponseToolCall & { type: "web_search_call" };
/**
 * The result of a function call: tagged "function_call_output", carrying the
 * `call_id` it belongs to and an optional string or object `output`.
 */
export type FunctionCallOutputItem = BaseItem & {
  type: "function_call_output";
  call_id: string;
  output?: string | object;
};

/** Any item the guards below accept. */
export type AnyResponseItem =
  | ResponseInput
  | ResponseOutput
  | FunctionCallOutputItem;

/** True when the input item's `type` is "message". */
export function isMessageInput(
  item: ResponseInput,
): item is ResponseInput & { type: "message" } {
  const isMessage = item.type === "message";
  return isMessage;
}

/** True when the item's `type` is "message" and it has a `content` property. */
export function isMessageItem(item: AnyResponseItem): item is MessageItem {
  if (item.type !== "message") {
    return false;
  }
  return "content" in item;
}

/** True when the item's `type` is "function_call" and it has a `name` property. */
export function isFunctionCallItem(
  item: AnyResponseItem,
): item is FunctionCallItem {
  if (item.type !== "function_call") {
    return false;
  }
  return "name" in item;
}

/** True when the item's `type` is "web_search_call"; no other field is checked. */
export function isWebSearchCallItem(
  item: AnyResponseItem,
): item is WebSearchCallItem {
  const isWebSearch = item.type === "web_search_call";
  return isWebSearch;
}

/**
 * True when the item's `type` is "function_call_output" and it carries a
 * `call_id` property whose value is a string.
 */
export function isFunctionCallOutputItem(
  item: AnyResponseItem,
): item is FunctionCallOutputItem {
  if (item.type !== "function_call_output") {
    return false;
  }
  if (!("call_id" in item)) {
    return false;
  }
  // The property exists; it must also be a string to qualify.
  return typeof (item as any).call_id === "string";
}
b/llama_stack/ui/components/ui/breadcrumb.tsx new file mode 100644 index 000000000..f63ae19af --- /dev/null +++ b/llama_stack/ui/components/ui/breadcrumb.tsx @@ -0,0 +1,109 @@ +import * as React from "react"; +import { Slot } from "@radix-ui/react-slot"; +import { ChevronRight, MoreHorizontal } from "lucide-react"; + +import { cn } from "@/lib/utils"; + +function Breadcrumb({ ...props }: React.ComponentProps<"nav">) { + return