Merge branch 'meta-llama:main' into main

2026-01-06 21:59:59 +00:00 · 2025-01-27 15:00:07 -05:00 · 2025-01-27 15:00:07 -05:00 · d77fdf2d06
commit d77fdf2d06
parent b61108c3f0 3c1a2c3d66
361 changed files with 24356 additions and 17233 deletions
--- a/.github/workflows/publish-to-docker.yml
+++ b/.github/workflows/publish-to-docker.yml
@ -0,0 +1,148 @@
+name: Docker Build and Publish
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'TestPyPI or PyPI version to build (e.g., 0.0.63.dev20250114)'
+        required: true
+        type: string
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    env:
+      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
+      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
+      TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Set version
+        id: version
+        run: |
+          if [ "${{ github.event_name }}" = "push" ]; then
+            echo "VERSION=0.0.63.dev51206766" >> $GITHUB_OUTPUT
+          else
+            echo "VERSION=${{ inputs.version }}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Check package version availability
+        run: |
+            # check_version <index-host>: succeeds (exit 0) when the requested llama-stack version is listed by https://<index-host>.org
+            check_version() {
+                local repo="$1"
+                local VERSION_TO_CHECK="${{ steps.version.outputs.version }}"
+                echo "Checking version $VERSION_TO_CHECK in $repo"
+                result=$(curl -s "https://$repo.org/pypi/llama-stack/json" | jq --arg v "$VERSION_TO_CHECK" '.releases | has($v)')
+                echo "Result: $result"
+                [ "$result" = "true" ]  # last command: its exit status is the function's return value
+            }
+
+            # Check TestPyPI first, then PyPI
+            if check_version "test.pypi"; then
+                echo "Version ${{ steps.version.outputs.version }} found in TestPyPI"
+                echo "PYPI_SOURCE=testpypi" >> $GITHUB_ENV
+            elif check_version "pypi"; then
+                echo "Version ${{ steps.version.outputs.version }} found in PyPI"
+                echo "PYPI_SOURCE=pypi" >> $GITHUB_ENV
+            else
+                echo "Error: Version ${{ steps.version.outputs.version }} not found in either TestPyPI or PyPI"
+                exit 1
+            fi
+
+      - name: Install llama-stack
+        run: |
+            echo "PYPI_SOURCE=${PYPI_SOURCE}"
+            if [ "${{ github.event_name }}" = "push" ]; then
+                pip install -e .
+            else
+                if [ "$PYPI_SOURCE" = "testpypi" ]; then
+                    pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple llama-stack==${{ steps.version.outputs.version }}
+                else
+                    pip install llama-stack==${{ steps.version.outputs.version }}
+                fi
+            fi
+
+      - name: Build docker image
+        run: |
+          echo "PYPI_SOURCE=${PYPI_SOURCE}"
+          echo "VERSION=${{ steps.version.outputs.version }}"
+          TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
+          for template in "${TEMPLATES[@]}"; do
+            if [ "$PYPI_SOURCE" = "testpypi" ]; then
+                TEST_PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
+            else
+                PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
+            fi
+          done
+
+      - name: List docker images
+        run: |
+          docker images
+
+      # TODO (xiyan): make the following 2 steps into a matrix and test all templates other than fireworks
+      - name: Start up built docker image
+        run: |
+          cd distributions/fireworks
+          if [ "$PYPI_SOURCE" = "testpypi" ]; then
+            sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:test-${{ steps.version.outputs.version }}|' ./compose.yaml
+          else
+            sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:${{ steps.version.outputs.version }}|' ./compose.yaml
+          fi
+          docker compose up -d
+          cd ..
+          # Wait for the container to start
+          timeout=300
+          # Check the budget only after a failed poll, so a success on the last attempt is not reported as a timeout
+          until curl -s -f http://localhost:8321/v1/version > /dev/null; do
+            if [ $timeout -le 0 ]; then
+              echo "Timeout waiting for endpoint to become available"
+              exit 1
+            fi
+            echo "Waiting for endpoint to be available..."
+            sleep 5
+            timeout=$((timeout - 5))
+          done
+
+      - name: Run simple models list test on docker server
+        run: |
+          curl http://localhost:8321/v1/models
+
+      # TODO (xiyan): figure out why client cannot find server but curl works
+      # - name: Run pytest on docker server
+      #   run: |
+      #     pip install pytest pytest-md-report
+      #     export LLAMA_STACK_BASE_URL="http://localhost:8321"
+      #     LLAMA_STACK_BASE_URL="http://localhost:8321" pytest -v tests/client-sdk/inference/test_inference.py --md-report --md-report-verbose=1
+
+      - name: Push to dockerhub
+        run: |
+          echo "PYPI_SOURCE=${PYPI_SOURCE}"
+          echo "VERSION=${{ steps.version.outputs.version }}"
+          TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
+          for template in "${TEMPLATES[@]}"; do
+            if [ "$PYPI_SOURCE" = "testpypi" ]; then
+                docker tag distribution-$template:test-${{ steps.version.outputs.version }} llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
+                docker push llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
+            else
+                docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:${{ steps.version.outputs.version }}
+                docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:latest
+                docker push llamastack/distribution-$template:${{ steps.version.outputs.version }}
+                docker push llamastack/distribution-$template:latest
+            fi
+          done
--- a/.github/workflows/publish-to-test-pypi.yml
+++ b/.github/workflows/publish-to-test-pypi.yml
@ -0,0 +1,244 @@
+name: Publish Python 🐍 distribution 📦 to TestPyPI
+
+on:
+  workflow_dispatch:  # Keep manual trigger
+    inputs:
+      version:
+        description: 'Version number (e.g. 0.0.63.dev20250111)'
+        required: true
+        type: string
+  schedule:
+    - cron: "0 0 * * *"  # Run every day at midnight
+
+jobs:
+  trigger-client-and-models-build:
+    name: Trigger llama-stack-client and llama-models build
+    runs-on: ubuntu-latest
+    outputs:
+      version: ${{ steps.version.outputs.version }}
+      client_run_id: ${{ steps.trigger-client.outputs.workflow_id }}
+      model_run_id: ${{ steps.trigger-models.outputs.workflow_id }}
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        persist-credentials: false
+    - name: Get date
+      id: date
+      run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
+    - name: Compute version based on dispatch event
+      id: version
+      run: |
+        # Read base version from setup.py
+        version=$(sed -n 's/.*version="\([^"]*\)".*/\1/p' setup.py)
+        if [ "${{ github.event_name }}" = "schedule" ]; then
+          echo "version=${version}.dev${{ steps.date.outputs.date }}" >> $GITHUB_OUTPUT
+        elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+          echo "version=${{ inputs.version }}" >> $GITHUB_OUTPUT
+        else
+          echo "version=${version}.dev$(shuf -i 10000000-99999999 -n 1)" >> $GITHUB_OUTPUT
+        fi
+    - name: Trigger llama-stack-client workflow
+      id: trigger-client
+      run: |
+        response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-stack-client-python/dispatches \
+        -H 'Accept: application/vnd.github.everest-preview+json' \
+        -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
+        --data "{\"event_type\": \"build-client-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
+        -w "\n%{http_code}")
+
+        http_code=$(echo "$response" | tail -n1)
+        if [ "$http_code" != "204" ]; then
+          echo "Failed to trigger client workflow"
+          exit 1
+        fi
+
+        # Get the run ID of the triggered workflow
+        sleep 5  # Wait for workflow to be created
+        run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
+                 "https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs?event=repository_dispatch" \
+                 | jq '.workflow_runs[0].id')
+        echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
+
+    - name: Trigger llama-models workflow
+      id: trigger-models
+      run: |
+        response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-models/dispatches \
+        -H 'Accept: application/vnd.github.everest-preview+json' \
+        -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
+        --data "{\"event_type\": \"build-models-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
+        -w "\n%{http_code}")
+
+        http_code=$(echo "$response" | tail -n1)
+        if [ "$http_code" != "204" ]; then
+          echo "Failed to trigger models workflow"
+          exit 1
+        fi
+
+        # Get the run ID of the triggered workflow
+        sleep 5  # Wait for workflow to be created
+        run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
+                 "https://api.github.com/repos/meta-llama/llama-models/actions/runs?event=repository_dispatch" \
+                 | jq '.workflow_runs[0].id')
+        echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
+
+  wait-for-workflows:
+    name: Wait for triggered workflows
+    needs: trigger-client-and-models-build
+    runs-on: ubuntu-latest
+    steps:
+    - name: Wait for client workflow
+      run: |
+        while true; do
+          # Fetch the run once per poll so status and conclusion come from the same API response.
+          # "|| true" keeps a transient curl failure non-fatal: the loop simply polls again.
+          run_json=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
+                     "https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" || true)
+          status=$(printf '%s\n' "$run_json" | jq -r '.status')
+          conclusion=$(printf '%s\n' "$run_json" | jq -r '.conclusion')
+
+          echo "llama-stack-client-python workflow status: $status, conclusion: $conclusion"
+
+          if [ "$status" = "completed" ]; then
+            if [ "$conclusion" != "success" ]; then
+              echo "llama-stack-client-python workflow failed"
+              exit 1
+            fi
+            break
+          fi
+
+          sleep 10
+        done
+
+    - name: Wait for models workflow
+      run: |
+        while true; do
+          # Fetch the run once per poll so status and conclusion come from the same API response.
+          # "|| true" keeps a transient curl failure non-fatal: the loop simply polls again.
+          run_json=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
+                     "https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" || true)
+          status=$(printf '%s\n' "$run_json" | jq -r '.status')
+          conclusion=$(printf '%s\n' "$run_json" | jq -r '.conclusion')
+
+          echo "llama-models workflow status: $status, conclusion: $conclusion"
+
+          if [ "$status" = "completed" ]; then
+            if [ "$conclusion" != "success" ]; then
+              echo "llama-models workflow failed"
+              exit 1
+            fi
+            break
+          fi
+
+          sleep 10
+        done
+
+  build:
+    name: Build distribution 📦
+    needs:
+      - wait-for-workflows
+      - trigger-client-and-models-build
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        persist-credentials: false
+    - name: Get date
+      id: date
+      run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
+    - name: Update version for nightly
+      run: |
+        sed -i 's/version="\([^"]*\)"/version="${{ needs.trigger-client-and-models-build.outputs.version }}"/' setup.py
+        sed -i 's/llama-stack-client>=\([^"]*\)/llama-stack-client==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
+        sed -i 's/llama-models>=\([^"]*\)/llama-models==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.11"
+    - name: Install pypa/build
+      run: >-
+        python3 -m
+        pip install
+        build
+        --user
+    - name: Build a binary wheel and a source tarball
+      run: python3 -m build
+    - name: Store the distribution packages
+      uses: actions/upload-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+
+  publish-to-testpypi:
+    name: Publish Python 🐍 distribution 📦 to TestPyPI
+    needs:
+    - build
+    runs-on: ubuntu-latest
+
+    environment:
+      name: testrelease
+      url: https://test.pypi.org/p/llama-stack
+
+    permissions:
+      id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+    steps:
+    - name: Download all the dists
+      uses: actions/download-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+    - name: Publish distribution 📦 to TestPyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        repository-url: https://test.pypi.org/legacy/
+
+  test-published-package:
+    name: Test published package
+    needs:
+      - publish-to-testpypi
+      - trigger-client-and-models-build
+    runs-on: ubuntu-latest
+    env:
+      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
+      TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        persist-credentials: false
+    - name: Install the package
+      run: |
+        max_attempts=6
+        attempt=1
+        while [ $attempt -le $max_attempts ]; do
+          echo "Attempt $attempt of $max_attempts to install package..."
+          if pip install --no-cache --index-url https://pypi.org/simple/ --extra-index-url https://test.pypi.org/simple/ llama-stack==${{ needs.trigger-client-and-models-build.outputs.version }}; then
+            echo "Package installed successfully"
+            break
+          fi
+          if [ $attempt -ge $max_attempts ]; then
+            echo "Failed to install package after $max_attempts attempts"
+            exit 1
+          fi
+          attempt=$((attempt + 1))
+          sleep 10
+        done
+    - name: Test the package versions
+      run: |
+        pip list | grep llama_
+    - name: Test CLI commands
+      run: |
+        llama model list
+        llama stack build --list-templates
+        llama model prompt-format -m Llama3.2-11B-Vision-Instruct
+        llama stack list-apis
+        llama stack list-providers inference
+        llama stack list-providers telemetry
+    - name: Test Notebook
+      run: |
+        pip install pytest nbval
+        llama stack build --template together --image-type venv
+        pytest -v -s --nbval-lax ./docs/getting_started.ipynb
+        pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+
+    # TODO: add trigger for integration test workflow & docker builds
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -12,6 +12,57 @@ We actively welcome your pull requests.
 5. Make sure your code lints.
 6. If you haven't already, complete the Contributor License Agreement ("CLA").

+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Meta's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+
+## Pre-commit Hooks
+
+We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
+
+```bash
+$ cd llama-stack
+$ conda activate <your-environment>
+$ pip install pre-commit
+$ pre-commit install
+```
+
+After that, pre-commit hooks will run automatically before each commit.
+
+
+## Coding Style
+* 2 spaces for indentation rather than tabs
+* 80 character line length
+* ...
+
+## Common Tasks
+
+Some tips about common tasks you work on while contributing to Llama Stack:
+
+### Using `llama stack build`
+
+Building a stack image (conda / docker) will use the production version of the `llama-stack`, `llama-models` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_MODELS_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
+
+Example:
+```bash
+$ cd work/
+$ git clone https://github.com/meta-llama/llama-stack.git
+$ git clone https://github.com/meta-llama/llama-models.git
+$ cd llama-stack
+$ LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack build --template <...>
+```
+

 ### Updating Provider Configurations

@ -31,40 +82,6 @@ make html
 sphinx-autobuild source build/html
 ```

-## Pre-commit Hooks
-
-We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
-
-```bash
-$ cd llama-stack
-$ conda activate <your-environment>
-$ pip install pre-commit
-$ pre-commit install
-```
-
-After that, pre-commit hooks will run automatically before each commit.
-
-## Contributor License Agreement ("CLA")
-In order to accept your pull request, we need you to submit a CLA. You only need
-to do this once to work on any of Meta's open source projects.
-
-Complete your CLA here: <https://code.facebook.com/cla>
-
-## Issues
-We use GitHub issues to track public bugs. Please ensure your description is
-clear and has sufficient instructions to be able to reproduce the issue.
-
-Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
-disclosure of security bugs. In those cases, please go through the process
-outlined on that page and do not file a public issue.
-
-## Coding Style
-* 2 spaces for indentation rather than tabs
-* 80 character line length
-* ...
-
-## Tips
-* If you are developing with a llama-stack repository checked out and need your distribution to reflect changes from there, set `LLAMA_STACK_DIR` to that dir when running any of the `llama` CLI commands.

 ## License
 By contributing to Llama, you agree that your contributions will be licensed
--- a/README.md
+++ b/README.md
@ -4,9 +4,15 @@
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
 [![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)

-[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Zero-to-Hero Guide**](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide)
+[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)

-Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.
+Llama Stack defines and standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides:
+
+- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
+- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
+- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android.
+- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.

 <div style="text-align: center;">
  <img
@ -17,66 +23,16 @@ Llama Stack defines and standardizes the set of core building blocks needed to b
  />
 </div>

-Our goal is to provide pre-packaged implementations which can be operated in a variety of deployment environments: developers start iterating with Desktops or their mobile devices and can seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.
+### Llama Stack Benefits
+- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
+- **Consistent Experience**: With its unified APIs Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
+- **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.

-> ⚠️ **Note**
-> The Stack APIs are rapidly improving, but still very much work in progress and we invite feedback as well as direct contributions.
+By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.

-
-## APIs
-
-We have working implementations of the following APIs today:
- Inference
- Safety
- Memory
- Agents
- Eval
- Telemetry
-
-Alongside these APIs, we also related APIs for operating with associated resources (see [Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#resources)):
-
- Models
- Shields
- Memory Banks
- Eval Tasks
- Datasets
- Scoring Functions
-
-We are also working on the following APIs which will be released soon:
-
- Post Training
- Synthetic Data Generation
- Reward Scoring
-
-Each of the APIs themselves is a collection of REST endpoints.
-
-## Philosophy
-
-### Service-oriented design
-
-Unlike other frameworks, Llama Stack is built with a service-oriented, REST API-first approach. Such a design not only allows for seamless transitions from a local to remote deployments, but also forces the design to be more declarative. We believe this restriction can result in a much simpler, robust developer experience. This will necessarily trade-off against expressivity however if we get the APIs right, it can lead to a very powerful platform.
-
-### Composability
-
-We expect the set of APIs we design to be composable. An Agent abstractly depends on { Inference, Memory, Safety } APIs but does not care about the actual implementation details. Safety itself may require model inference and hence can depend on the Inference API.
-
-### Turnkey one-stop solutions
-
-We expect to provide turnkey solutions for popular deployment scenarios. It should be easy to deploy a Llama Stack server on AWS or on a private data center. Either of these should allow a developer to get started with powerful agentic apps, model evaluations or fine-tuning services in a matter of minutes. They should all result in the same uniform observability and developer experience.
-
-### Focus on Llama models
-
-As a Meta initiated project, we have started by explicitly focusing on Meta's Llama series of models. Supporting the broad set of open models is no easy task and we want to start with models we understand best.
-
-### Supporting the Ecosystem
-
-There is a vibrant ecosystem of Providers which provide efficient inference or scalable vector stores or powerful observability solutions. We want to make sure it is easy for developers to pick and choose the best implementations for their use cases. We also want to make sure it is easy for new Providers to onboard and participate in the ecosystem.
-
-Additionally, we have designed every element of the Stack such that APIs as well as Resources (like Models) can be federated.
-
-
-## Supported Llama Stack Implementations
 ### API Providers
+Here is a list of the various API providers and available distributions that can help developers get started easily:
+
 |                                  **API Provider Builder**                                  |    **Environments**    |     **Agents**     |   **Inference**    |     **Memory**     |     **Safety**     |   **Telemetry**    |
 |:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:|
 |                                       Meta Reference                                       |      Single Node       | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
@ -87,26 +43,28 @@ Additionally, we have designed every element of the Stack such that APIs as well
 |                                            Groq                                            |         Hosted         |                    | :heavy_check_mark: |                    |                    |                    |
 |                                           Ollama                                           |      Single Node       |                    | :heavy_check_mark: |                    |                    |                    |
 |                                            TGI                                             | Hosted and Single Node |                    | :heavy_check_mark: |                    |                    |                    |
-| [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) | Hosted and Single Node |                    | :heavy_check_mark: |                    |                    |                    |
+| NVIDIA NIM | Hosted and Single Node |                    | :heavy_check_mark: |                    |                    |                    |
 |                                           Chroma                                           |      Single Node       |                    |                    | :heavy_check_mark: |                    |                    |
 |                                         PG Vector                                          |      Single Node       |                    |                    | :heavy_check_mark: |                    |                    |
 |                                     PyTorch ExecuTorch                                     |     On-device iOS      | :heavy_check_mark: | :heavy_check_mark: |                    |                    |                    |
-|                        [vLLM](https://github.com/vllm-project/vllm)                        | Hosted and Single Node |                    | :heavy_check_mark: |                    |                    |                    |
+|                        vLLM                        | Hosted and Single Node |                    | :heavy_check_mark: |                    |                    |                    |

 ### Distributions

+A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider implementations for each API component. Distributions make it easy to get started with a specific deployment scenario - you can begin with a local development setup (e.g., Ollama) and seamlessly transition to production (e.g., Fireworks) without changing your application code. Here are some of the distributions we support:
+
 |               **Distribution**                |                                                                    **Llama Stack Docker**                                                                     |                                                 Start This Distribution                                                  |
 |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
 |                Meta Reference                 |           [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html)      |
 |           Meta Reference Quantized            | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
-|                   Cerebras                    |                     [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/cerebras.html)   |
+|                   Cerebras                    |                     [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html)   |
 |                    Ollama                     |                       [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general)                       |            [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html)            |
 |                      TGI                      |                          [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general)                          |             [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html)              |
 |                   Together                    |                     [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general)                     |           [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html)           |
 |                   Fireworks                   |                    [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general)                    |          [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html)           |
-| [vLLM](https://github.com/vllm-project/vllm)  |                  [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general)                  |         [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html)          |
+| vLLM |                  [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general)                  |         [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html)          |

-## Installation
+### Installation

 You have two ways to install this repository:

@ -131,7 +89,7 @@ You have two ways to install this repository:
    pip install -e .
   ```

-## Documentation
+### Documentation

 Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.

@ -139,13 +97,13 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
    * Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
 * [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)
    * Quick guide to start a Llama Stack server.
-    * [Jupyter notebook](./docs/notebooks/Llama_Stack_Building_AI_Applications.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
+    * [Jupyter notebook](./docs/getting_started.ipynb) to walk through how to use simple text and vision inference llama_stack_client APIs
    * The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
    * A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples.
 * [Contributing](CONTRIBUTING.md)
    * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider.

-## Llama Stack Client SDKs
+### Llama Stack Client SDKs

 |  **Language** |  **Client SDK** | **Package** |
 | :----: | :----: | :----: |
--- a/distributions/bedrock/compose.yaml
+++ b/distributions/bedrock/compose.yaml
@ -5,7 +5,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-bedrock.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
    deploy:
      restart_policy:
--- a/distributions/cerebras/compose.yaml
+++ b/distributions/cerebras/compose.yaml
@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-cerebras.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
    deploy:
      restart_policy:
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@ -40,7 +40,7 @@ services:
      # Link to TGI run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
--- a/distributions/dell-tgi/run.yaml
+++ b/distributions/dell-tgi/run.yaml
@ -1,6 +1,6 @@
 version: '2'
 image_name: local
-docker_image: null
+container_image: null
 conda_env: local
 apis:
 - shields
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@ -1,4 +1,34 @@
 {
+  "sambanova": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
  "hf-serverless": [
    "aiohttp",
    "aiosqlite",
@ -13,6 +43,7 @@
    "httpx",
    "huggingface_hub",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -45,6 +76,7 @@
    "fire",
    "httpx",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -78,6 +110,7 @@
    "fire",
    "httpx",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -101,14 +134,17 @@
  ],
  "remote-vllm": [
    "aiosqlite",
+    "autoevals",
    "blobfile",
    "chardet",
    "chromadb-client",
+    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
    "httpx",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -142,6 +178,7 @@
    "fireworks-ai",
    "httpx",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -176,6 +213,7 @@
    "httpx",
    "huggingface_hub",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -209,6 +247,7 @@
    "fire",
    "httpx",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -244,6 +283,7 @@
    "httpx",
    "lm-format-enforcer",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -268,6 +308,38 @@
    "sentence-transformers --no-deps",
    "torch --index-url https://download.pytorch.org/whl/cpu"
  ],
+  "nvidia": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "datasets",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
  "meta-reference-quantized-gpu": [
    "accelerate",
    "aiosqlite",
@ -284,6 +356,7 @@
    "httpx",
    "lm-format-enforcer",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
@ -311,9 +384,12 @@
  ],
  "cerebras": [
    "aiosqlite",
+    "autoevals",
    "blobfile",
    "cerebras_cloud_sdk",
    "chardet",
+    "chromadb-client",
+    "datasets",
    "faiss-cpu",
    "fastapi",
    "fire",
@ -321,6 +397,7 @@
    "matplotlib",
    "nltk",
    "numpy",
+    "openai",
    "opentelemetry-exporter-otlp-proto-http",
    "opentelemetry-sdk",
    "pandas",
@ -386,6 +463,7 @@
    "httpx",
    "huggingface_hub",
    "matplotlib",
+    "mcp",
    "nltk",
    "numpy",
    "openai",
--- a/distributions/fireworks/compose.yaml
+++ b/distributions/fireworks/compose.yaml
@ -1,13 +1,11 @@
 services:
  llamastack:
    image: llamastack/distribution-fireworks
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-fireworks.yaml
    ports:
-      - "5000:5000"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
+      - "8321:8321"
+    environment:
+      - FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
    deploy:
      restart_policy:
        condition: on-failure
--- a/distributions/meta-reference-gpu/compose.yaml
+++ b/distributions/meta-reference-gpu/compose.yaml
@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
--- a/distributions/meta-reference-quantized-gpu/compose.yaml
+++ b/distributions/meta-reference-quantized-gpu/compose.yaml
@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
--- a/distributions/meta-reference-quantized-gpu/run.yaml
+++ b/distributions/meta-reference-quantized-gpu/run.yaml
@ -1,6 +1,6 @@
 version: '2'
 image_name: local
-docker_image: null
+container_image: null
 conda_env: local
 apis:
 - shields
--- a/distributions/remote-nvidia/build.yaml
+++ b/distributions/remote-nvidia/build.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/nvidia/build.yaml
--- a/distributions/remote-nvidia/compose.yaml
+++ b/distributions/remote-nvidia/compose.yaml
@ -0,0 +1,19 @@
+services:
+  llamastack:
+    image: distribution-nvidia:dev
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-nvidia.yaml
+    ports:
+      - "8321:8321"
+    environment:
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
--- a/distributions/remote-nvidia/run.yaml
+++ b/distributions/remote-nvidia/run.yaml
@ -0,0 +1 @@
+../../llama_stack/templates/nvidia/run.yaml
--- a/distributions/remote-vllm/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@ -85,7 +85,7 @@ services:
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
-      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
+      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
    # Hack: wait for vLLM server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
    deploy:
--- a/distributions/runpod/build.yaml
+++ b/distributions/runpod/build.yaml
@ -0,0 +1,9 @@
+name: runpod
+distribution_spec:
+  description: Use Runpod for running LLM inference
+  providers:
+    inference: remote::runpod
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
--- a/distributions/sambanova/build.yaml
+++ b/distributions/sambanova/build.yaml
@ -0,0 +1,19 @@
+version: '2'
+name: sambanova
+distribution_spec:
+  description: Use SambaNova.AI for running LLM inference
+  docker_image: null
+  providers:
+    inference:
+    - remote::sambanova
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
--- a/distributions/sambanova/compose.yaml
+++ b/distributions/sambanova/compose.yaml
@ -0,0 +1,16 @@
+services:
+  llamastack:
+    image: llamastack/distribution-sambanova
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/llamastack-run-sambanova.yaml
+    ports:
+      - "5000:5000"
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
--- a/distributions/sambanova/run.yaml
+++ b/distributions/sambanova/run.yaml
@ -0,0 +1,83 @@
+version: '2'
+image_name: sambanova
+docker_image: null
+conda_env: sambanova
+apis:
+- agents
+- inference
+- memory
+- safety
+- telemetry
+providers:
+  inference:
+  - provider_id: sambanova
+    provider_type: remote::sambanova
+    config:
+      url: https://api.sambanova.ai/v1/
+      api_key: ${env.SAMBANOVA_API_KEY}
+  memory:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/registry.db
+models:
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-8B-Instruct
+  provider_id: null
+  provider_model_id: Meta-Llama-3.1-8B-Instruct
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-70B-Instruct
+  provider_id: null
+  provider_model_id: Meta-Llama-3.1-70B-Instruct
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-405B-Instruct
+  provider_id: null
+  provider_model_id: Meta-Llama-3.1-405B-Instruct
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-1B-Instruct
+  provider_id: null
+  provider_model_id: Meta-Llama-3.2-1B-Instruct
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-3B-Instruct
+  provider_id: null
+  provider_model_id: Meta-Llama-3.2-3B-Instruct
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
+  provider_id: null
+  provider_model_id: Llama-3.2-11B-Vision-Instruct
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
+  provider_id: null
+  provider_model_id: Llama-3.2-90B-Vision-Instruct
+shields:
+- params: null
+  shield_id: meta-llama/Llama-Guard-3-8B
+  provider_id: null
+  provider_shield_id: null
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
--- a/distributions/together/compose.yaml
+++ b/distributions/together/compose.yaml
@ -1,13 +1,11 @@
 services:
  llamastack:
    image: llamastack/distribution-together
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-together.yaml
    ports:
-      - "5000:5000"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
+      - "8321:8321"
+    environment:
+      - TOGETHER_API_KEY=${TOGETHER_API_KEY}
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
    deploy:
      restart_policy:
        condition: on-failure
--- a/distributions/vllm-gpu/compose.yaml
+++ b/distributions/vllm-gpu/compose.yaml
@ -6,7 +6,7 @@ services:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
-      - "5000:5000"
+      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
--- a/distributions/vllm-gpu/run.yaml
+++ b/distributions/vllm-gpu/run.yaml
@ -1,6 +1,6 @@
 version: '2'
 image_name: local
-docker_image: null
+container_image: null
 conda_env: local
 apis:
 - shields
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
--- a/docs/notebooks/Llama_Stack_Building_AI_Applications.ipynb
+++ b/docs/notebooks/Llama_Stack_Building_AI_Applications.ipynb
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@ -486,13 +486,22 @@ class Generator:
        parameters = path_parameters + query_parameters
        parameters += [
            Parameter(
-                name="X-LlamaStack-ProviderData",
+                name="X-LlamaStack-Provider-Data",
                in_=ParameterLocation.Header,
                description="JSON-encoded provider data which will be made available to the adapter servicing the API",
                required=False,
                schema=self.schema_builder.classdef_to_ref(str),
            )
        ]
+        parameters += [
+            Parameter(
+                name="X-LlamaStack-Client-Version",
+                in_=ParameterLocation.Header,
+                description="Version of the client making the request. This is used to ensure that the client and server are compatible.",
+                required=False,
+                schema=self.schema_builder.classdef_to_ref(str),
+            )
+        ]

        # data passed in payload
        if op.request_params:
@ -528,7 +537,6 @@ class Generator:
            success_type_descriptions = {
                item: doc_string.short_description
                for item, doc_string in success_type_docstring.items()
-                if doc_string.short_description
            }
        else:
            # use return type as a single response type
@ -587,6 +595,7 @@ class Generator:
            )
            responses.update(response_builder.build_response(response_options))

+        assert len(responses.keys()) > 0, f"No responses found for {op.name}"
        if op.event_type is not None:
            builder = ContentBuilder(self.schema_builder)
            callbacks = {
--- a/docs/openapi_generator/pyopenapi/operations.py
+++ b/docs/openapi_generator/pyopenapi/operations.py
@ -8,7 +8,6 @@ import collections.abc
 import enum
 import inspect
 import typing
-import uuid
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union

@ -16,12 +15,7 @@ from llama_stack.apis.version import LLAMA_STACK_API_VERSION

 from termcolor import colored

-from ..strong_typing.inspection import (
-    get_signature,
-    is_type_enum,
-    is_type_optional,
-    unwrap_optional_type,
-)
+from ..strong_typing.inspection import get_signature


 def split_prefix(
@ -113,9 +107,6 @@ class EndpointOperation:

    def get_route(self) -> str:
        if self.route is not None:
-            assert (
-                "_" not in self.route
-            ), f"route should not contain underscores: {self.route}"
            return "/".join(["", LLAMA_STACK_API_VERSION, self.route.lstrip("/")])

        route_parts = ["", LLAMA_STACK_API_VERSION, self.name]
@ -181,10 +172,16 @@ def _get_endpoint_functions(
 def _get_defining_class(member_fn: str, derived_cls: type) -> type:
    "Find the class in which a member function is first defined in a class inheritance hierarchy."

+    # This import must be dynamic here
+    from llama_stack.apis.tools import RAGToolRuntime, ToolRuntime
+
    # iterate in reverse member resolution order to find most specific class first
    for cls in reversed(inspect.getmro(derived_cls)):
        for name, _ in inspect.getmembers(cls, inspect.isfunction):
            if name == member_fn:
+                # HACK ALERT
+                if cls == RAGToolRuntime:
+                    return ToolRuntime
                return cls

    raise ValidationError(
@ -265,42 +262,16 @@ def get_endpoint_operations(
                    f"parameter '{param_name}' in function '{func_name}' has no type annotation"
                )

-            if is_type_optional(param_type):
-                inner_type: type = unwrap_optional_type(param_type)
-            else:
-                inner_type = param_type
-
-            if prefix == "get" and (
-                inner_type is bool
-                or inner_type is int
-                or inner_type is float
-                or inner_type is str
-                or inner_type is uuid.UUID
-                or is_type_enum(inner_type)
-            ):
-                if parameter.kind == inspect.Parameter.POSITIONAL_ONLY:
-                    if route_params is not None and param_name not in route_params:
-                        raise ValidationError(
-                            f"positional parameter '{param_name}' absent from user-defined route '{route}' for function '{func_name}'"
-                        )
-
-                    # simple type maps to route path element, e.g. /study/{uuid}/{version}
+            if prefix in ["get", "delete"]:
+                if route_params is not None and param_name in route_params:
                    path_params.append((param_name, param_type))
                else:
-                    if route_params is not None and param_name in route_params:
-                        raise ValidationError(
-                            f"query parameter '{param_name}' found in user-defined route '{route}' for function '{func_name}'"
-                        )
-
-                    # simple type maps to key=value pair in query string
                    query_params.append((param_name, param_type))
            else:
                if route_params is not None and param_name in route_params:
-                    raise ValidationError(
-                        f"user-defined route '{route}' for function '{func_name}' has parameter '{param_name}' of composite type: {param_type}"
-                    )
-
-                request_params.append((param_name, param_type))
+                    path_params.append((param_name, param_type))
+                else:
+                    request_params.append((param_name, param_type))

        # check if function has explicit return type
        if signature.return_annotation is inspect.Signature.empty:
@ -335,19 +306,18 @@ def get_endpoint_operations(

            response_type = process_type(return_type)

-        # set HTTP request method based on type of request and presence of payload
-        if not request_params:
            if prefix in ["delete", "remove"]:
                http_method = HTTPMethod.DELETE
-            else:
+            elif prefix == "post":
+                http_method = HTTPMethod.POST
+            elif prefix == "get":
                http_method = HTTPMethod.GET
-        else:
-            if prefix == "set":
+            elif prefix == "set":
                http_method = HTTPMethod.PUT
            elif prefix == "update":
                http_method = HTTPMethod.PATCH
            else:
-                http_method = HTTPMethod.POST
+                raise ValidationError(f"unknown prefix {prefix}")

        result.append(
            EndpointOperation(
--- a/docs/openapi_generator/strong_typing/classdef.py
+++ b/docs/openapi_generator/strong_typing/classdef.py
@ -122,9 +122,16 @@ class JsonSchemaAnyOf(JsonSchemaNode):
    anyOf: List["JsonSchemaAny"]


+@dataclass
+class Discriminator:
+    propertyName: str
+    mapping: Dict[str, str]
+
+
@dataclass
 class JsonSchemaOneOf(JsonSchemaNode):
    oneOf: List["JsonSchemaAny"]
+    discriminator: Optional[Discriminator]


 JsonSchemaAny = Union[
--- a/docs/openapi_generator/strong_typing/inspection.py
+++ b/docs/openapi_generator/strong_typing/inspection.py
@ -342,7 +342,6 @@ def is_type_union(typ: object) -> bool:
    "True if the type annotation corresponds to a union type (e.g. `Union[T1,T2,T3]`)."

    typ = unwrap_annotated_type(typ)
-
    if _is_union_like(typ):
        args = typing.get_args(typ)
        return len(args) > 2 or type(None) not in args
--- a/docs/openapi_generator/strong_typing/schema.py
+++ b/docs/openapi_generator/strong_typing/schema.py
@ -36,6 +36,7 @@ from typing import (
 )

 import jsonschema
+from typing_extensions import Annotated

 from . import docstring
 from .auxiliary import (
@ -329,7 +330,6 @@ class JsonSchemaGenerator:
        if metadata is not None:
            # type is Annotated[T, ...]
            typ = typing.get_args(data_type)[0]
-
            schema = self._simple_type_to_schema(typ)
            if schema is not None:
                # recognize well-known auxiliary types
@ -446,12 +446,31 @@ class JsonSchemaGenerator:
                ],
            }
        elif origin_type is Union:
-            return {
+            discriminator = None
+            if typing.get_origin(data_type) is Annotated:
+                discriminator = typing.get_args(data_type)[1].discriminator
+            ret = {
                "oneOf": [
                    self.type_to_schema(union_type)
                    for union_type in typing.get_args(typ)
                ]
            }
+            if discriminator:
+                # for each union type, we need to read the value of the discriminator
+                mapping = {}
+                for union_type in typing.get_args(typ):
+                    props = self.type_to_schema(union_type, force_expand=True)[
+                        "properties"
+                    ]
+                    mapping[props[discriminator]["default"]] = self.type_to_schema(
+                        union_type
+                    )["$ref"]
+
+                ret["discriminator"] = {
+                    "propertyName": discriminator,
+                    "mapping": mapping,
+                }
+            return ret
        elif origin_type is Literal:
            (literal_value,) = typing.get_args(typ)  # unpack value of literal type
            schema = self.type_to_schema(type(literal_value))
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
--- a/docs/source/building_applications/agent_execution_loop.md
+++ b/docs/source/building_applications/agent_execution_loop.md
@ -0,0 +1,123 @@
+## Agent Execution Loop
+
+Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.
+
+Each agent turn follows these key steps:
+
+1. **Initial Safety Check**: The user's input is first screened through configured safety shields
+
+2. **Context Retrieval**:
+   - If RAG is enabled, the agent queries relevant documents from memory banks
+   - New documents are first inserted into the memory bank
+   - The user's prompt is augmented with the retrieved context
+
+3. **Inference Loop**: The agent enters its main execution loop:
+   - The LLM receives the augmented prompt (with context and/or previous tool outputs)
+   - The LLM generates a response, potentially with tool calls
+   - If tool calls are present:
+     - Tool inputs are safety-checked
+     - Tools are executed (e.g., web search, code execution)
+     - Tool responses are fed back to the LLM for synthesis
+   - The loop continues until:
+     - The LLM provides a final response without tool calls
+     - Maximum iterations are reached
+     - Token limit is exceeded
+
+4. **Final Safety Check**: The agent's final response is screened through safety shields
+
+```{mermaid}
+sequenceDiagram
+    participant U as User
+    participant E as Executor
+    participant M as Memory Bank
+    participant L as LLM
+    participant T as Tools
+    participant S as Safety Shield
+
+    Note over U,S: Agent Turn Start
+    U->>S: 1. Submit Prompt
+    activate S
+    S->>E: Input Safety Check
+    deactivate S
+
+    E->>M: 2.1 Query Context
+    M-->>E: 2.2 Retrieved Documents
+
+    loop Inference Loop
+        E->>L: 3.1 Augment with Context
+        L-->>E: 3.2 Response (with/without tool calls)
+
+        alt Has Tool Calls
+            E->>S: Check Tool Input
+            S->>T: 4.1 Execute Tool
+            T-->>E: 4.2 Tool Response
+            E->>L: 5.1 Tool Response
+            L-->>E: 5.2 Synthesized Response
+        end
+
+        opt Stop Conditions
+            Note over E: Break if:
+            Note over E: - No tool calls
+            Note over E: - Max iterations reached
+            Note over E: - Token limit exceeded
+        end
+    end
+
+    E->>S: Output Safety Check
+    S->>U: 6. Final Response
+```
+
+Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:
+
+```python
+from llama_stack_client.lib.agents.event_logger import EventLogger
+
+agent_config = AgentConfig(
+    model="Llama3.2-3B-Instruct",
+    instructions="You are a helpful assistant",
+    # Enable both RAG and tool usage
+    toolgroups=[
+        {"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}},
+        "builtin::code_interpreter",
+    ],
+    # Configure safety
+    input_shields=["llama_guard"],
+    output_shields=["llama_guard"],
+    # Control the inference loop
+    max_infer_iters=5,
+    sampling_params={
+        "strategy": {
+            "type": "top_p",
+            "temperature": 0.7,
+            "top_p": 0.95
+        },
+        "max_tokens": 2048
+    }
+)
+
+agent = Agent(client, agent_config)
+session_id = agent.create_session("monitored_session")
+
+# Stream the agent's execution steps
+response = agent.create_turn(
+    messages=[{"role": "user", "content": "Analyze this code and run it"}],
+    attachments=[{
+        "content": "https://raw.githubusercontent.com/example/code.py",
+        "mime_type": "text/plain"
+    }],
+    session_id=session_id
+)
+
+# Monitor each step of execution
+for log in EventLogger().log(response):
+    if log.event.step_type == "memory_retrieval":
+        print("Retrieved context:", log.event.retrieved_context)
+    elif log.event.step_type == "inference":
+        print("LLM output:", log.event.model_response)
+    elif log.event.step_type == "tool_execution":
+        print("Tool call:", log.event.tool_call)
+        print("Tool response:", log.event.tool_response)
+    elif log.event.step_type == "shield_call":
+        if log.event.violation:
+            print("Safety violation:", log.event.violation)
+```
--- a/docs/source/building_applications/evals.md
+++ b/docs/source/building_applications/evals.md
@ -1,8 +1,8 @@
-# Benchmark Evaluations
+# Evals

 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing)

-Llama Stack provides the building blocks needed to run benchmark and application evaluations. This guide will walk you through how to use these components to run open benchmark evaluations. Visit our [Evaluation Concepts](../concepts/evaluation_concepts.md) guide for more details on how evaluations work in Llama Stack, and our [Evaluation Reference](../references/evals_reference/index.md) guide for a comprehensive reference on the APIs. Check out our [Colab notebook](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing) on working examples on how you can use Llama Stack for running benchmark evaluations.
+Llama Stack provides the building blocks needed to run benchmark and application evaluations. This guide will walk you through how to use these components to run open benchmark evaluations. Visit our [Evaluation Concepts](../concepts/evaluation_concepts.md) guide for more details on how evaluations work in Llama Stack, and our [Evaluation Reference](../references/evals_reference/index.md) guide for a comprehensive reference on the APIs.

 ### 1. Open Benchmark Model Evaluation

@ -56,9 +56,10 @@ response = client.eval.evaluate_rows(
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            "sampling_params": {
-                "temperature": 0.0,
+                "strategy": {
+                    "type": "greedy",
+                },
                "max_tokens": 4096,
-                "top_p": 0.9,
                "repeat_penalty": 1.0,
            },
            "system_message": system_message
@ -113,9 +114,10 @@ response = client.eval.evaluate_rows(
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            "sampling_params": {
-                "temperature": 0.0,
+                "strategy": {
+                    "type": "greedy",
+                },
                "max_tokens": 4096,
-                "top_p": 0.9,
                "repeat_penalty": 1.0,
            },
        }
@ -134,9 +136,9 @@ agent_config = {
    "model": "meta-llama/Llama-3.1-405B-Instruct",
    "instructions": "You are a helpful assistant",
    "sampling_params": {
-        "strategy": "greedy",
-        "temperature": 0.0,
-        "top_p": 0.95,
+        "strategy": {
+            "type": "greedy",
+        },
    },
    "tools": [
        {
--- a/docs/source/building_applications/evaluation.md
+++ b/docs/source/building_applications/evaluation.md
@ -0,0 +1,36 @@
+## Testing & Evaluation
+
+Llama Stack provides built-in tools for evaluating your applications:
+
+1. **Benchmarking**: Test against standard datasets
+2. **Application Evaluation**: Score your application's outputs
+3. **Custom Metrics**: Define your own evaluation criteria
+
+Here's how to set up basic evaluation:
+
+```python
+# Create an evaluation task
+response = client.eval_tasks.register(
+    eval_task_id="my_eval",
+    dataset_id="my_dataset",
+    scoring_functions=["accuracy", "relevance"]
+)
+
+# Run evaluation
+job = client.eval.run_eval(
+    task_id="my_eval",
+    task_config={
+        "type": "app",
+        "eval_candidate": {
+            "type": "agent",
+            "config": agent_config
+        }
+    }
+)
+
+# Get results
+result = client.eval.job_result(
+    task_id="my_eval",
+    job_id=job.job_id
+)
+```
--- a/docs/source/building_applications/index.md
+++ b/docs/source/building_applications/index.md
@ -1,421 +1,29 @@
 # Building AI Applications

-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1F2ksmkoGQPa4pzRjMOE6BXWeOxWFIW6n?usp=sharing)
+Llama Stack provides all the building blocks needed to create sophisticated AI applications.

-Llama Stack provides all the building blocks needed to create sophisticated AI applications. This guide will walk you through how to use these components effectively. Check out our Colab notebook on to follow along working examples on how you can build LLM-powered agentic applications using Llama Stack.
+The best way to get started is to look at this notebook, which walks through the various APIs (from basic inference to RAG agents) and how to use them.

-## Basic Inference
+**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)

-The foundation of any AI application is the ability to interact with LLM models. Llama Stack provides a simple interface for both completion and chat-based inference:
+Here are some key topics that will help you build effective agents:

-```python
-from llama_stack_client import LlamaStackClient
+- **[Agent Execution Loop](agent_execution_loop)**
+- **[RAG](rag)**
+- **[Safety](safety)**
+- **[Tools](tools)**
+- **[Telemetry](telemetry)**
+- **[Evals](evals)**

-client = LlamaStackClient(base_url="http://localhost:5001")
-
-# List available models
-models = client.models.list()
-
-# Simple chat completion
-response = client.inference.chat_completion(
-    model_id="Llama3.2-3B-Instruct",
-    messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "Write a haiku about coding"}
-    ]
-)
-print(response.completion_message.content)
-```
-
-## Adding Memory & RAG
-
-Memory enables your applications to reference and recall information from previous interactions or external documents. Llama Stack's memory system is built around the concept of Memory Banks:
-
-1. **Vector Memory Banks**: For semantic search and retrieval
-2. **Key-Value Memory Banks**: For structured data storage
-3. **Keyword Memory Banks**: For basic text search
-4. **Graph Memory Banks**: For relationship-based retrieval
-
-Here's how to set up a vector memory bank for RAG:
-
-```python
-# Register a memory bank
-bank_id = "my_documents"
-response = client.memory_banks.register(
-    memory_bank_id=bank_id,
-    params={
-        "memory_bank_type": "vector",
-        "embedding_model": "all-MiniLM-L6-v2",
-        "chunk_size_in_tokens": 512
-    }
-)
-
-# Insert documents
-documents = [
-    {
-        "document_id": "doc1",
-        "content": "Your document text here",
-        "mime_type": "text/plain"
-    }
-]
-client.memory.insert(bank_id, documents)
-
-# Query documents
-results = client.memory.query(
-    bank_id=bank_id,
-    query="What do you know about...",
-)
-```
-
-## Implementing Safety Guardrails
-
-Safety is a critical component of any AI application. Llama Stack provides a Shield system that can be applied at multiple touchpoints:
-
-```python
-# Register a safety shield
-shield_id = "content_safety"
-client.shields.register(
-    shield_id=shield_id,
-    provider_shield_id="llama-guard-basic"
-)
-
-# Run content through shield
-response = client.safety.run_shield(
-    shield_id=shield_id,
-    messages=[{"role": "user", "content": "User message here"}]
-)
-
-if response.violation:
-    print(f"Safety violation detected: {response.violation.user_message}")
-```
-
-## Building Agents
-
-Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.
-
-### The Agent Execution Loop
-
-Each agent turn follows these key steps:
-
-1. **Initial Safety Check**: The user's input is first screened through configured safety shields
-
-2. **Context Retrieval**:
-   - If RAG is enabled, the agent queries relevant documents from memory banks
-   - For new documents, they are first inserted into the memory bank
-   - Retrieved context is augmented to the user's prompt
-
-3. **Inference Loop**: The agent enters its main execution loop:
-   - The LLM receives the augmented prompt (with context and/or previous tool outputs)
-   - The LLM generates a response, potentially with tool calls
-   - If tool calls are present:
-     - Tool inputs are safety-checked
-     - Tools are executed (e.g., web search, code execution)
-     - Tool responses are fed back to the LLM for synthesis
-   - The loop continues until:
-     - The LLM provides a final response without tool calls
-     - Maximum iterations are reached
-     - Token limit is exceeded
-
-4. **Final Safety Check**: The agent's final response is screened through safety shields
-
-```{mermaid}
-sequenceDiagram
-    participant U as User
-    participant E as Executor
-    participant M as Memory Bank
-    participant L as LLM
-    participant T as Tools
-    participant S as Safety Shield
-
-    Note over U,S: Agent Turn Start
-    U->>S: 1. Submit Prompt
-    activate S
-    S->>E: Input Safety Check
-    deactivate S
-
-    E->>M: 2.1 Query Context
-    M-->>E: 2.2 Retrieved Documents
-
-    loop Inference Loop
-        E->>L: 3.1 Augment with Context
-        L-->>E: 3.2 Response (with/without tool calls)
-
-        alt Has Tool Calls
-            E->>S: Check Tool Input
-            S->>T: 4.1 Execute Tool
-            T-->>E: 4.2 Tool Response
-            E->>L: 5.1 Tool Response
-            L-->>E: 5.2 Synthesized Response
-        end
-
-        opt Stop Conditions
-            Note over E: Break if:
-            Note over E: - No tool calls
-            Note over E: - Max iterations reached
-            Note over E: - Token limit exceeded
-        end
-    end
-
-    E->>S: Output Safety Check
-    S->>U: 6. Final Response
-```
-
-Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:
-
-```python
-from llama_stack_client.lib.agents.event_logger import EventLogger
-
-agent_config = AgentConfig(
-    model="Llama3.2-3B-Instruct",
-    instructions="You are a helpful assistant",
-    # Enable both RAG and tool usage
-    tools=[
-        {
-            "type": "memory",
-            "memory_bank_configs": [{
-                "type": "vector",
-                "bank_id": "my_docs"
-            }],
-            "max_tokens_in_context": 4096
-        },
-        {
-            "type": "code_interpreter",
-            "enable_inline_code_execution": True
-        }
-    ],
-    # Configure safety
-    input_shields=["content_safety"],
-    output_shields=["content_safety"],
-    # Control the inference loop
-    max_infer_iters=5,
-    sampling_params={
-        "temperature": 0.7,
-        "max_tokens": 2048
-    }
-)
-
-agent = Agent(client, agent_config)
-session_id = agent.create_session("monitored_session")
-
-# Stream the agent's execution steps
-response = agent.create_turn(
-    messages=[{"role": "user", "content": "Analyze this code and run it"}],
-    attachments=[{
-        "content": "https://raw.githubusercontent.com/example/code.py",
-        "mime_type": "text/plain"
-    }],
-    session_id=session_id
-)
-
-# Monitor each step of execution
-for log in EventLogger().log(response):
-    if log.event.step_type == "memory_retrieval":
-        print("Retrieved context:", log.event.retrieved_context)
-    elif log.event.step_type == "inference":
-        print("LLM output:", log.event.model_response)
-    elif log.event.step_type == "tool_execution":
-        print("Tool call:", log.event.tool_call)
-        print("Tool response:", log.event.tool_response)
-    elif log.event.step_type == "shield_call":
-        if log.event.violation:
-            print("Safety violation:", log.event.violation)
-```
-
-This example shows how an agent can: Llama Stack provides a high-level agent framework:
-
-```python
-from llama_stack_client.lib.agents.agent import Agent
-from llama_stack_client.types.agent_create_params import AgentConfig
-
-# Configure an agent
-agent_config = AgentConfig(
-    model="Llama3.2-3B-Instruct",
-    instructions="You are a helpful assistant",
-    tools=[
-        {
-            "type": "memory",
-            "memory_bank_configs": [],
-            "query_generator_config": {
-                "type": "default",
-                "sep": " "
-            }
-        }
-    ],
-    input_shields=["content_safety"],
-    output_shields=["content_safety"],
-    enable_session_persistence=True
-)
-
-# Create an agent
-agent = Agent(client, agent_config)
-session_id = agent.create_session("my_session")
-
-# Run agent turns
-response = agent.create_turn(
-    messages=[{"role": "user", "content": "Your question here"}],
-    session_id=session_id
-)
-```
-
-### Adding Tools to Agents
-
-Agents can be enhanced with various tools:
-
-1. **Search**: Web search capabilities through providers like Brave
-2. **Code Interpreter**: Execute code snippets
-3. **RAG**: Memory and document retrieval
-4. **Function Calling**: Custom function execution
-5. **WolframAlpha**: Mathematical computations
-6. **Photogen**: Image generation
-
-Example of configuring an agent with tools:
-
-```python
-agent_config = AgentConfig(
-    model="Llama3.2-3B-Instruct",
-    tools=[
-        {
-            "type": "brave_search",
-            "api_key": "YOUR_API_KEY",
-            "engine": "brave"
-        },
-        {
-            "type": "code_interpreter",
-            "enable_inline_code_execution": True
-        }
-    ],
-    tool_choice="auto",
-    tool_prompt_format="json"
-)
-```
-
-## Building RAG-Enhanced Agents
-
-One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
-
-```python
-from llama_stack_client.types import Attachment
-
-# Create attachments from documents
-attachments = [
-    Attachment(
-        content="https://raw.githubusercontent.com/example/doc.rst",
-        mime_type="text/plain"
-    )
-]
-
-# Configure agent with memory
-agent_config = AgentConfig(
-    model="Llama3.2-3B-Instruct",
-    instructions="You are a helpful assistant",
-    tools=[{
-        "type": "memory",
-        "memory_bank_configs": [],
-        "query_generator_config": {"type": "default", "sep": " "},
-        "max_tokens_in_context": 4096,
-        "max_chunks": 10
-    }],
-    enable_session_persistence=True
-)
-
-agent = Agent(client, agent_config)
-session_id = agent.create_session("rag_session")
-
-# Initial document ingestion
-response = agent.create_turn(
-    messages=[{
-        "role": "user",
-        "content": "I am providing some documents for reference."
-    }],
-    attachments=attachments,
-    session_id=session_id
-)
-
-# Query with RAG
-response = agent.create_turn(
-    messages=[{
-        "role": "user",
-        "content": "What are the key topics in the documents?"
-    }],
-    session_id=session_id
-)
-```
-
-## Testing & Evaluation
-
-Llama Stack provides built-in tools for evaluating your applications:
-
-1. **Benchmarking**: Test against standard datasets
-2. **Application Evaluation**: Score your application's outputs
-3. **Custom Metrics**: Define your own evaluation criteria
-
-Here's how to set up basic evaluation:
-
-```python
-# Create an evaluation task
-response = client.eval_tasks.register(
-    eval_task_id="my_eval",
-    dataset_id="my_dataset",
-    scoring_functions=["accuracy", "relevance"]
-)
-
-# Run evaluation
-job = client.eval.run_eval(
-    task_id="my_eval",
-    task_config={
-        "type": "app",
-        "eval_candidate": {
-            "type": "agent",
-            "config": agent_config
-        }
-    }
-)
-
-# Get results
-result = client.eval.job_result(
-    task_id="my_eval",
-    job_id=job.job_id
-)
-```
-
-## Debugging & Monitoring
-
-Llama Stack includes comprehensive telemetry for debugging and monitoring your applications:
-
-1. **Tracing**: Track request flows across components
-2. **Metrics**: Measure performance and usage
-3. **Logging**: Debug issues and track behavior
-
-The telemetry system supports multiple output formats:
-
- OpenTelemetry for visualization in tools like Jaeger
- SQLite for local storage and querying
- Console output for development
-
-Example of querying traces:
-
-```python
-# Query traces for a session
-traces = client.telemetry.query_traces(
-    attribute_filters=[{
-        "key": "session_id",
-        "op": "eq",
-        "value": session_id
-    }]
-)
-
-# Get spans within the root span; indexed by ID
-# Use parent_span_id to build a tree out of it
-spans_by_id = client.telemetry.get_span_tree(
-    span_id=traces[0].root_span_id
-)
-```
-
-For details on how to use the telemetry system to debug your applications, export traces to a dataset, and run evaluations, see the [Telemetry](telemetry) section.

 ```{toctree}
 :hidden:
-:maxdepth: 3
+:maxdepth: 1

+agent_execution_loop
+rag
+safety
+tools
 telemetry
+evals
 ```
--- a/docs/source/building_applications/rag.md
+++ b/docs/source/building_applications/rag.md
@ -0,0 +1,92 @@
+## Memory & RAG
+
+Memory enables your applications to reference and recall information from previous interactions or external documents. Llama Stack's memory system is built around the concept of Memory Banks:
+
+1. **Vector Memory Banks**: For semantic search and retrieval
+2. **Key-Value Memory Banks**: For structured data storage
+3. **Keyword Memory Banks**: For basic text search
+4. **Graph Memory Banks**: For relationship-based retrieval
+
+Here's how to set up a vector memory bank for RAG:
+
+```python
+# Register a memory bank
+bank_id = "my_documents"
+response = client.memory_banks.register(
+    memory_bank_id=bank_id,
+    params={
+        "memory_bank_type": "vector",
+        "embedding_model": "all-MiniLM-L6-v2",
+        "chunk_size_in_tokens": 512
+    }
+)
+
+# Insert documents
+documents = [
+    {
+        "document_id": "doc1",
+        "content": "Your document text here",
+        "mime_type": "text/plain"
+    }
+]
+client.memory.insert(bank_id, documents)
+
+# Query documents
+results = client.memory.query(
+    bank_id=bank_id,
+    query="What do you know about...",
+)
+```
+
+
+### Building RAG-Enhanced Agents
+
+One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
+
+```python
+from llama_stack_client.types import Attachment
+
+# Create attachments from documents
+attachments = [
+    Attachment(
+        content="https://raw.githubusercontent.com/example/doc.rst",
+        mime_type="text/plain"
+    )
+]
+
+# Configure agent with memory
+agent_config = AgentConfig(
+    model="Llama3.2-3B-Instruct",
+    instructions="You are a helpful assistant",
+    tools=[{
+        "type": "memory",
+        "memory_bank_configs": [],
+        "query_generator_config": {"type": "default", "sep": " "},
+        "max_tokens_in_context": 4096,
+        "max_chunks": 10
+    }],
+    enable_session_persistence=True
+)
+
+agent = Agent(client, agent_config)
+session_id = agent.create_session("rag_session")
+
+# Initial document ingestion
+response = agent.create_turn(
+    messages=[{
+        "role": "user",
+        "content": "I am providing some documents for reference."
+    }],
+    attachments=attachments,
+    session_id=session_id
+)
+
+# Query with RAG
+response = agent.create_turn(
+    messages=[{
+        "role": "user",
+        "content": "What are the key topics in the documents?"
+    }],
+    session_id=session_id
+)
+```
--- a/docs/source/building_applications/safety.md
+++ b/docs/source/building_applications/safety.md
@ -0,0 +1,21 @@
+## Safety Guardrails
+
+Safety is a critical component of any AI application. Llama Stack provides a Shield system that can be applied at multiple touchpoints:
+
+```python
+# Register a safety shield
+shield_id = "content_safety"
+client.shields.register(
+    shield_id=shield_id,
+    provider_shield_id="llama-guard-basic"
+)
+
+# Run content through shield
+response = client.safety.run_shield(
+    shield_id=shield_id,
+    messages=[{"role": "user", "content": "User message here"}]
+)
+
+if response.violation:
+    print(f"Safety violation detected: {response.violation.user_message}")
+```
--- a/docs/source/building_applications/telemetry.md
+++ b/docs/source/building_applications/telemetry.md
@ -1,14 +1,7 @@
-# Telemetry
-```{note}
-The telemetry system is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
-```
-
-
+## Telemetry

 The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output.

-## Key Concepts
-
 ### Events
 The telemetry system supports three main types of events:

@ -44,67 +37,15 @@ structured_log_event = SpanStartPayload(
 - **SQLite**: Store events in a local SQLite database. This is needed if you want to query the events later through the Llama Stack API.
 - **Console**: Print events to the console.

-## APIs
+### Providers

-The telemetry API is designed to be flexible for different user flows like debugging/visualization in UI, monitoring, and saving traces to datasets.
-The telemetry system exposes the following HTTP endpoints:
-
-### Log Event
-```http
-POST /telemetry/log-event
-```
-Logs a telemetry event (unstructured log, metric, or structured log) with optional TTL.
-
-### Query Traces
-```http
-POST /telemetry/query-traces
-```
-Retrieves traces based on filters with pagination support. Parameters:
- `attribute_filters`: List of conditions to filter traces
- `limit`: Maximum number of traces to return (default: 100)
- `offset`: Number of traces to skip (default: 0)
- `order_by`: List of fields to sort by
-
-### Get Span Tree
-```http
-POST /telemetry/get-span-tree
-```
-Retrieves a hierarchical view of spans starting from a specific span. Parameters:
- `span_id`: ID of the root span to retrieve
- `attributes_to_return`: Optional list of specific attributes to include
- `max_depth`: Optional maximum depth of the span tree to return
-
-### Query Spans
-```http
-POST /telemetry/query-spans
-```
-Retrieves spans matching specified filters and returns selected attributes. Parameters:
- `attribute_filters`: List of conditions to filter traces
- `attributes_to_return`: List of specific attributes to include in results
- `max_depth`: Optional maximum depth of spans to traverse (default: no limit)
-
-Returns a flattened list of spans with requested attributes.
-
-### Save Spans to Dataset
-This is useful for saving traces to a dataset for running evaluations. For example, you can save the input/output of each span that is part of an agent session/turn to a dataset and then run an eval task on it. See example in [Example: Save Spans to Dataset](#example-save-spans-to-dataset).
-```http
-POST /telemetry/save-spans-to-dataset
-```
-Queries spans and saves their attributes to a dataset. Parameters:
- `attribute_filters`: List of conditions to filter traces
- `attributes_to_save`: List of span attributes to save to the dataset
- `dataset_id`: ID of the dataset to save to
- `max_depth`: Optional maximum depth of spans to traverse (default: no limit)
-
-## Providers
-
-### Meta-Reference Provider
+#### Meta-Reference Provider
 Currently, only the meta-reference provider is implemented. It can be configured to send events to three sink types:
 1) OpenTelemetry Collector
 2) SQLite
 3) Console

-## Configuration
+#### Configuration

 Here's an example that sends telemetry signals to all three sink types. Your configuration might use only one.
 ```yaml
@ -117,7 +58,7 @@ Here's an example that sends telemetry signals to all three sink types. Your con
      sqlite_db_path: "/path/to/telemetry.db"
 ```

-## Jaeger to visualize traces
+### Jaeger to visualize traces

 The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.

@ -131,112 +72,6 @@ $ docker run --rm --name jaeger \

 Once the Jaeger instance is running, you can visualize traces by navigating to http://localhost:16686/.

-## Querying Traces Stored in SQLIte
+### Querying Traces Stored in SQLite

-The `sqlite` sink allows you to query traces without an external system. Here are some example queries:
-
-Querying Traces for a agent session
-The client SDK is not updated to support the new telemetry API. It will be updated soon. You can manually query traces using the following curl command:
-
-``` bash
- curl -X POST 'http://localhost:5000/alpha/telemetry/query-traces' \
-H 'Content-Type: application/json' \
-d '{
-  "attribute_filters": [
-    {
-      "key": "session_id",
-      "op": "eq",
-      "value": "dd667b87-ca4b-4d30-9265-5a0de318fc65" }],
-  "limit": 100,
-  "offset": 0,
-  "order_by": ["start_time"]
-
-  [
-  {
-    "trace_id": "6902f54b83b4b48be18a6f422b13e16f",
-    "root_span_id": "5f37b85543afc15a",
-    "start_time": "2024-12-04T08:08:30.501587",
-    "end_time": "2024-12-04T08:08:36.026463"
-  },
-  ........
-]
-}'
-
-```
-
-Querying spans for a specifc root span id
-
-``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \
-H 'Content-Type: application/json' \
-d '{ "span_id" : "6cceb4b48a156913", "max_depth": 2 }'
-
-{
-  "span_id": "6cceb4b48a156913",
-  "trace_id": "dafa796f6aaf925f511c04cd7c67fdda",
-  "parent_span_id": "892a66d726c7f990",
-  "name": "retrieve_rag_context",
-  "start_time": "2024-12-04T09:28:21.781995",
-  "end_time": "2024-12-04T09:28:21.913352",
-  "attributes": {
-    "input": [
-      "{\"role\":\"system\",\"content\":\"You are a helpful assistant\"}",
-      "{\"role\":\"user\",\"content\":\"What are the top 5 topics that were explained in the documentation? Only list succinct bullet points.\",\"context\":null}"
-    ]
-  },
-  "children": [
-    {
-      "span_id": "1a2df181854064a8",
-      "trace_id": "dafa796f6aaf925f511c04cd7c67fdda",
-      "parent_span_id": "6cceb4b48a156913",
-      "name": "MemoryRouter.query_documents",
-      "start_time": "2024-12-04T09:28:21.787620",
-      "end_time": "2024-12-04T09:28:21.906512",
-      "attributes": {
-        "input": null
-      },
-      "children": [],
-      "status": "ok"
-    }
-  ],
-  "status": "ok"
-}
-
-```
-
-## Example: Save Spans to Dataset
-Save all spans for a specific agent session to a dataset.
-``` bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
-H 'Content-Type: application/json' \
-d '{
-    "attribute_filters": [
-        {
-            "key": "session_id",
-            "op": "eq",
-            "value": "dd667b87-ca4b-4d30-9265-5a0de318fc65"
-        }
-    ],
-    "attributes_to_save": ["input", "output"],
-    "dataset_id": "my_dataset",
-    "max_depth": 10
-}'
-```
-
-Save all spans for a specific agent turn to a dataset.
-```bash
-curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
-H 'Content-Type: application/json' \
-d '{
-    "attribute_filters": [
-        {
-            "key": "turn_id",
-            "op": "eq",
-            "value": "123e4567-e89b-12d3-a456-426614174000"
-        }
-    ],
-    "attributes_to_save": ["input", "output"],
-    "dataset_id": "my_dataset",
-    "max_depth": 10
-}'
-```
+The `sqlite` sink allows you to query traces without an external system. For example queries, refer to the [Llama Stack Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook, which shows how to query traces and spans.
--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@ -0,0 +1,202 @@
+# Tools
+
+Tools are functions that can be invoked by an agent to perform tasks. They are organized into tool groups and registered with specific providers. Each tool group represents a collection of related tools from a single provider. Grouping tools this way allows state to be externalized: the tools in a group typically operate on the same state.
+An example of this would be a "db_access" tool group that contains tools for interacting with a database. "list_tables", "query_table", "insert_row" could be examples of tools in this group.
+
+Tools are treated like any other resource in Llama Stack, such as models: you can register them, configure providers for them, and so on.
+
+When instantiating an agent, you can provide it a list of tool groups that it has access to. The agent gets the corresponding tool definitions for the specified tool groups and passes them along to the model.
+
+Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools.
+
+## Types of Tool Group providers
+
+There are three types of providers for tool groups that are supported by Llama Stack.
+
+1. Built-in providers
+2. Model Context Protocol (MCP) providers
+3. Client provided tools
+
+### Built-in providers
+
+Built-in providers come packaged with Llama Stack. These providers provide common functionalities like web search, code interpretation, and computational capabilities.
+
+#### Web Search providers
+There are three web search providers that are supported by Llama Stack.
+
+1. Brave Search
+2. Bing Search
+3. Tavily Search
+
+Example client SDK call to register a "websearch" toolgroup that is provided by brave-search.
+
+```python
+# Register Brave Search tool group
+client.toolgroups.register(
+    toolgroup_id="builtin::websearch",
+    provider_id="brave-search",
+    args={"max_results": 5}
+)
+```
+
+The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is `{"<provider_name>_api_key": <your api key>}`.
+
+
+
+#### Code Interpreter
+
+The Code Interpreter allows execution of Python code within a controlled environment.
+
+```python
+# Register Code Interpreter tool group
+client.toolgroups.register(
+    toolgroup_id="builtin::code_interpreter",
+    provider_id="code_interpreter"
+)
+```
+
+Features:
+- Secure execution environment using `bwrap` sandboxing
+- Matplotlib support for generating plots
+- Disabled dangerous system operations
+- Configurable execution timeouts
+
+#### WolframAlpha
+
+The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.
+
+```python
+# Register WolframAlpha tool group
+client.toolgroups.register(
+    toolgroup_id="builtin::wolfram_alpha",
+    provider_id="wolfram-alpha"
+)
+```
+
+Example usage:
+```python
+result = client.tool_runtime.invoke_tool(
+    tool_name="wolfram_alpha",
+    args={"query": "solve x^2 + 2x + 1 = 0"}
+)
+```
+
+#### Memory
+
+The Memory tool enables retrieval of context from various types of memory banks (vector, key-value, keyword, and graph).
+
+```python
+# Register Memory tool group
+client.toolgroups.register(
+    toolgroup_id="builtin::memory",
+    provider_id="memory",
+    args={
+        "max_chunks": 5,
+        "max_tokens_in_context": 4096
+    }
+)
+```
+
+Features:
+- Support for multiple memory bank types
+- Configurable query generation
+- Context retrieval with token limits
+
+
+> **Note:** By default, the Llama Stack `run.yaml` defines tool groups for web search, code interpreter, and memory, which are provided by the `tavily-search`, `code-interpreter`, and `memory` providers respectively.
+
+## Model Context Protocol (MCP) Tools
+
+MCP tools are special tools that can interact with llama stack over model context protocol. These tools are dynamically discovered from an MCP endpoint and can be used to extend the agent's capabilities.
+
+Refer to https://github.com/modelcontextprotocol/servers for available MCP servers.
+
+```python
+# Register MCP tools
+client.toolgroups.register(
+    toolgroup_id="builtin::filesystem",
+    provider_id="model-context-protocol",
+    mcp_endpoint=URL(uri="http://localhost:8000/sse"),
+)
+```
+
+MCP tools require:
+- A valid MCP endpoint URL
+- The endpoint must implement the Model Context Protocol
+- Tools are discovered dynamically from the endpoint
+
+
+## Tools provided by the client
+
+These tools are registered along with the agent config and are specific to the agent for which they are registered. The main difference from the tools provided by the built-in providers is that execution is handled by the client: the agent transfers the tool call to the client and waits for the client to return the result.
+
+```python
+# Example agent config with client provided tools
+config = AgentConfig(
+    toolgroups=[
+        "builtin::websearch",
+    ],
+    client_tools=[
+        ToolDef(name="client_tool", description="Client provided tool")
+    ]
+)
+```
+
+Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools.
+
+## Tool Structure
+
+Each tool has the following components:
+
+- `name`: Unique identifier for the tool
+- `description`: Human-readable description of the tool's functionality
+- `parameters`: List of parameters the tool accepts
+  - `name`: Parameter name
+  - `parameter_type`: Data type (string, number, etc.)
+  - `description`: Parameter description
+  - `required`: Whether the parameter is required (default: true)
+  - `default`: Default value if any
+
+Example tool definition:
+```python
+{
+    "name": "web_search",
+    "description": "Search the web for information",
+    "parameters": [
+        {
+            "name": "query",
+            "parameter_type": "string",
+            "description": "The query to search for",
+            "required": True
+        }
+    ]
+}
+```
+
+## Tool Invocation
+
+Tools can be invoked using the `invoke_tool` method:
+
+```python
+result = client.tool_runtime.invoke_tool(
+    tool_name="web_search",
+    kwargs={"query": "What is the capital of France?"}
+)
+```
+
+The result contains:
+- `content`: The tool's output
+- `error_message`: Optional error message if the tool failed
+- `error_code`: Optional error code if the tool failed
+
+## Listing Available Tools
+
+You can list all available tools or filter by tool group:
+
+```python
+# List all tools
+all_tools = client.tools.list_tools()
+
+# List tools in a specific group
+group_tools = client.tools.list_tools(toolgroup_id="search_tools")
+```
--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@ -10,7 +10,6 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
 - **Inference**: run inference with a LLM
 - **Safety**: apply safety policies to the output at a Systems (not only model) level
 - **Agents**: run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc.
- **Memory**: store and retrieve data for RAG, chat history, etc.
 - **DatasetIO**: interface with datasets and data loaders
 - **Scoring**: evaluate outputs of the system
 - **Eval**: generate outputs (via Inference or Agents) and perform scoring
@ -24,22 +23,23 @@ We are working on adding a few more APIs to complete the application lifecycle.

 ## API Providers

-The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Obvious examples for these include
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, etc.),
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, etc.),
+The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples of these include:
+- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, etc.),
+- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, FAISS, PGVector, etc.),
 - Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)

 Providers come in two flavors:
 - **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code.
 - **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack.

+Most importantly, Llama Stack always strives to provide at least one fully "local" provider for each API so you can iterate on a fully featured environment locally.
 ## Resources

 Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources:

 - **Inference**, **Eval** and **Post Training** are associated with `Model` resources.
 - **Safety** is associated with `Shield` resources.
- **Memory** is associated with `Memory Bank` resources.
+- **Tool Runtime** is associated with `ToolGroup` resources.
 - **DatasetIO** is associated with `Dataset` resources.
 - **Scoring** is associated with `ScoringFunction` resources.
 - **Eval** is associated with `Model` and `EvalTask` resources.
@ -58,17 +58,14 @@ While there is a lot of flexibility to mix-and-match providers, often users will

 **Remotely Hosted Distro**: These are the simplest to consume from a user perspective. You can simply obtain the API key for these providers, point to a URL and have _all_ Llama Stack APIs working out of the box. Currently, [Fireworks](https://fireworks.ai/) and [Together](https://together.xyz/) provide such easy-to-consume Llama Stack distributions.

-**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Cerebras, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.
+**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.


 **On-device Distro**: Finally, you may want to run Llama Stack directly on an edge device (mobile phone or a tablet.) We provide Distros for iOS and Android (coming soon.)

-## More Concepts
- [Evaluation Concepts](evaluation_concepts.md)
-
 ```{toctree}
 :maxdepth: 1
 :hidden:

-evaluation_concepts
+distributions/index
 ```
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@ -1,9 +1,14 @@
 # Contributing to Llama Stack

+Start with the [Contributing Guide](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md) for some general tips. This section covers a few key topics in more detail.
+
+- [Adding a New API Provider](new_api_provider.md) describes adding new API providers to the Stack.
+- [Testing Llama Stack](testing.md) provides details about the testing framework and how to test providers and distributions.

 ```{toctree}
 :maxdepth: 1
+:hidden:

 new_api_provider
-memory_api
+testing
 ```
--- a/docs/source/contributing/memory_api.md
+++ b/docs/source/contributing/memory_api.md
@ -1,53 +0,0 @@
-# Memory API Providers
-
-This guide gives you references to switch between different memory API providers.
-
-##### pgvector
-1. Start running the pgvector server:
-
-```
-$ docker run --network host --name mypostgres -it -p 5432:5432 -e POSTGRES_PASSWORD=mysecretpassword -e POSTGRES_USER=postgres -e POSTGRES_DB=postgres pgvector/pgvector:pg16
-```
-
-2. Edit the `run.yaml` file to point to the pgvector server.
-```
-memory:
-  - provider_id: pgvector
-    provider_type: remote::pgvector
-    config:
-      host: 127.0.0.1
-      port: 5432
-      db: postgres
-      user: postgres
-      password: mysecretpassword
-```
-
-> [!NOTE]
-> If you get a `RuntimeError: Vector extension is not installed.`. You will need to run `CREATE EXTENSION IF NOT EXISTS vector;` to include the vector extension. E.g.
-
-```
-docker exec -it mypostgres ./bin/psql -U postgres
-postgres=# CREATE EXTENSION IF NOT EXISTS vector;
-postgres=# SELECT extname from pg_extension;
- extname
-```
-
-3. Run `docker compose up` with the updated `run.yaml` file.
-
-##### chromadb
-1. Start running chromadb server
-```
-docker run -it --network host --name chromadb -p 6000:6000 -v ./chroma_vdb:/chroma/chroma -e IS_PERSISTENT=TRUE chromadb/chroma:latest
-```
-
-2. Edit the `run.yaml` file to point to the chromadb server.
-```
-memory:
-  - provider_id: remote::chromadb
-    provider_type: remote::chromadb
-    config:
-      host: localhost
-      port: 6000
-```
-
-3. Run `docker compose up` with the updated `run.yaml` file.
--- a/docs/source/contributing/new_api_provider.md
+++ b/docs/source/contributing/new_api_provider.md
@ -1,26 +1,41 @@
 # Adding a New API Provider

-This guide contains references to walk you through adding a new API provider.
+This guide will walk you through the process of adding a new API provider to Llama Stack.

-1. First, decide which API your provider falls into (e.g. Inference, Safety, Agents, Memory).
-2. Decide whether your provider is a remote provider, or inline implementation. A remote provider is a provider that makes a remote request to a service. An inline provider is a provider where implementation is executed locally. Checkout the examples, and follow the structure to add your own API provider. Please find the following code pointers:

-    - {repopath}`Remote Providers::llama_stack/providers/remote`
-    - {repopath}`Inline Providers::llama_stack/providers/inline`
+- Begin by reviewing the [core concepts](../concepts/) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.).
+- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
+- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify any necessary pip dependencies.
+- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`llama_stack/scripts/distro_codegen.py` if necessary.

-3. [Build a Llama Stack distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) with your API provider.
-4. Test your code!

-## Testing your newly added API providers
+Here are some example PRs to help you get started:
+   - [Groq Inference Implementation](https://github.com/meta-llama/llama-stack/pull/609)
+   - [Nvidia Inference Implementation](https://github.com/meta-llama/llama-stack/pull/355)
+   - [Model context protocol Tool Runtime](https://github.com/meta-llama/llama-stack/pull/665)

-1. Start with an _integration test_ for your provider. That means we will instantiate the real provider, pass it real configuration and if it is a remote service, we will actually hit the remote service. We **strongly** discourage mocking for these tests at the provider level. Llama Stack is first and foremost about integration so we need to make sure stuff works end-to-end. See {repopath}`llama_stack/providers/tests/inference/test_text_inference.py` for an example.

-2. In addition, if you want to unit test functionality within your provider, feel free to do so. You can find some tests in `tests/` but they aren't well-supported so far.
+## Testing the Provider

-3. Test with a client-server Llama Stack setup. (a) Start a Llama Stack server with your own distribution which includes the new provider. (b) Send a client request to the server. See `llama_stack/apis/<api>/client.py` for how this is done. These client scripts can serve as lightweight tests.
+### 1. Integration Testing
+- Create integration tests that use real provider instances and configurations
+- For remote services, test actual API interactions
+- Avoid mocking at the provider level since adapter layers tend to be thin
+- Reference examples in {repopath}`tests/client-sdk`

-You can find more complex client scripts [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repo. Note down which scripts works and do not work with your distribution.
+### 2. Unit Testing (Optional)
+- Add unit tests for provider-specific functionality
+- See examples in {repopath}`llama_stack/providers/tests/inference/test_text_inference.py`

-## Submit your PR
+### 3. End-to-End Testing
+1. Start a Llama Stack server with your new provider
+2. Test using client requests
+3. Verify compatibility with existing client scripts in the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repository
+4. Document which scripts are compatible with your provider

-After you have fully tested your newly added API provider, submit a PR with the attached test plan. You must have a Test Plan in the summary section of your PR.
+## Submitting Your PR
+
+1. Ensure all tests pass
+2. Include a comprehensive test plan in your PR summary
+3. Document any known limitations or considerations
+4. Submit your pull request for review
--- a/docs/source/contributing/testing.md
+++ b/docs/source/contributing/testing.md
@ -0,0 +1,6 @@
+# Testing Llama Stack
+
+Tests are of three different kinds:
+- Unit tests
+- Provider focused integration tests
+- Client SDK tests
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -4,7 +4,7 @@
 This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.


-## Llama Stack Build
+### Llama Stack Build

 In order to build your own distribution, we recommend you clone the `llama-stack` repository.

@ -13,29 +13,99 @@ In order to build your own distribution, we recommend you clone the `llama-stack
 git clone git@github.com:meta-llama/llama-stack.git
 cd llama-stack
 pip install -e .
-
-llama stack build -h
 ```
+Use the CLI to build your distribution.
+The main points to consider are:
+1. **Image Type** - Do you want a Conda / venv environment or a Container (e.g., Docker)?
+2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
+3. **Config** - Do you want to use a pre-existing config file to build your distribution?

-We will start build our distribution (in the form of a Conda environment, or Docker image). In this step, we will specify:
- `name`: the name for our distribution (e.g. `my-stack`)
- `image_type`: our build image type (`conda | docker`)
- `distribution_spec`: our distribution specs for specifying API providers
-  - `description`: a short description of the configurations for the distribution
-  - `providers`: specifies the underlying implementation for serving each API endpoint
-  - `image_type`: `conda` | `docker` to specify whether to build the distribution in the form of Docker image or Conda environment.
+```
+llama stack build -h
+
+usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates | --no-list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME]
+
+Build a Llama stack container
+
+options:
+  -h, --help            show this help message and exit
+  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack/distribution/**/build.yaml.
+                        If this argument is not provided, you will be prompted to enter information interactively
+  --template TEMPLATE   Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates
+  --list-templates, --no-list-templates
+                        Show the available templates for building a Llama Stack distribution (default: False)
+  --image-type {conda,container,venv}
+                        Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config.
+  --image-name IMAGE_NAME
+                        [for image-type=conda] Name of the conda environment to use for the build. If
+                        not specified, currently active Conda environment will be used. If no Conda
+                        environment is active, you must specify a name.
+```

 After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.

 ::::{tab-set}
+:::{tab-item} Building from a template
+To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
+
+The following command will allow you to see the available templates and their corresponding providers.
+```
+llama stack build --list-templates
+```
+
+```
++------------------------------+-----------------------------------------------------------------------------+
+| Template Name                | Description                                                                 |
++------------------------------+-----------------------------------------------------------------------------+
+| hf-serverless                | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
++------------------------------+-----------------------------------------------------------------------------+
+| together                     | Use Together.AI for running LLM inference                                   |
++------------------------------+-----------------------------------------------------------------------------+
+| vllm-gpu                     | Use a built-in vLLM engine for running LLM inference                        |
++------------------------------+-----------------------------------------------------------------------------+
+| experimental-post-training   | Experimental template for post training                                     |
++------------------------------+-----------------------------------------------------------------------------+
+| remote-vllm                  | Use (an external) vLLM server for running LLM inference                     |
++------------------------------+-----------------------------------------------------------------------------+
+| fireworks                    | Use Fireworks.AI for running LLM inference                                  |
++------------------------------+-----------------------------------------------------------------------------+
+| tgi                          | Use (an external) TGI server for running LLM inference                      |
++------------------------------+-----------------------------------------------------------------------------+
+| bedrock                      | Use AWS Bedrock for running LLM inference and safety                        |
++------------------------------+-----------------------------------------------------------------------------+
+| meta-reference-gpu           | Use Meta Reference for running LLM inference                                |
++------------------------------+-----------------------------------------------------------------------------+
+| nvidia                       | Use NVIDIA NIM for running LLM inference                                    |
++------------------------------+-----------------------------------------------------------------------------+
+| meta-reference-quantized-gpu | Use Meta Reference with fp8, int4 quantization for running LLM inference    |
++------------------------------+-----------------------------------------------------------------------------+
+| cerebras                     | Use Cerebras for running LLM inference                                      |
++------------------------------+-----------------------------------------------------------------------------+
+| ollama                       | Use (an external) Ollama server for running LLM inference                   |
++------------------------------+-----------------------------------------------------------------------------+
+| hf-endpoint                  | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
++------------------------------+-----------------------------------------------------------------------------+
+```
+
+You may then pick a template to build your distribution with providers fitted to your liking.
+
+For example, to build a distribution with TGI as the inference provider, you can run:
+```
+$ llama stack build --template tgi
+...
+You can now edit ~/.llama/distributions/llamastack-tgi/tgi-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml`
+```
+:::
 :::{tab-item} Building from Scratch

- For a new user, we could start off with running `llama stack build` which will allow you to a interactively enter wizard where you will be prompted to enter build configurations.
+If the provided templates do not fit your use case, you can start by running `llama stack build`, which launches an interactive wizard that prompts you to enter build configurations.
+
+It is best to start with a template and understand the structure of the config file and the various concepts (APIs, providers, resources, etc.) before starting from scratch.
 ```
 llama stack build

 > Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
-> Enter the image type you want your Llama Stack to be built as (docker or conda): conda
+> Enter the image type you want your Llama Stack to be built as (container or conda): conda

 Llama Stack is composed of several APIs working together. Let's select
 the provider types (implementations) you want to use for these APIs.
@ -57,272 +127,6 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack
 ```
 :::

-:::{tab-item} Building from a template
- To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
-
-The following command will allow you to see the available templates and their corresponding providers.
-```
-llama stack build --list-templates
-```
-
-```
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| Template Name                | Providers                              | Description                                                                 |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| tgi                          | {                                      | Use (an external) TGI server for running LLM inference                      |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::tgi"                      |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| remote-vllm                  | {                                      | Use (an external) vLLM server for running LLM inference                     |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::vllm"                     |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| vllm-gpu                     | {                                      | Use a built-in vLLM engine for running LLM inference                        |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "inline::vllm"                     |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| meta-reference-quantized-gpu | {                                      | Use Meta Reference with fp8, int4 quantization for running LLM inference    |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "inline::meta-reference-quantized" |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| meta-reference-gpu           | {                                      | Use Meta Reference for running LLM inference                                |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| hf-serverless                | {                                      | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::hf::serverless"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| together                     | {                                      | Use Together.AI for running LLM inference                                   |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::together"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| ollama                       | {                                      | Use (an external) Ollama server for running LLM inference                   |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::ollama"                   |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| bedrock                      | {                                      | Use AWS Bedrock for running LLM inference and safety                        |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::bedrock"                  |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "remote::bedrock"                  |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| hf-endpoint                  | {                                      | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::hf::endpoint"             |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| fireworks                    | {                                      | Use Fireworks.AI for running LLM inference                                  |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::fireworks"                |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::faiss",                   |                                                                             |
-|                              |     "remote::chromadb",                |                                                                             |
-|                              |     "remote::pgvector"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-| cerebras                     | {                                      | Use Cerebras for running LLM inference                                      |
-|                              |   "inference": [                       |                                                                             |
-|                              |     "remote::cerebras"                 |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "safety": [                          |                                                                             |
-|                              |     "inline::llama-guard"              |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "memory": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "agents": [                          |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ],                                   |                                                                             |
-|                              |   "telemetry": [                       |                                                                             |
-|                              |     "inline::meta-reference"           |                                                                             |
-|                              |   ]                                    |                                                                             |
-|                              | }                                      |                                                                             |
-+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
-```
-
-You may then pick a template to build your distribution with providers fitted to your liking.
-
-For example, to build a distribution with TGI as the inference provider, you can run:
-```
-llama stack build --template tgi
-```
-
-```
-$ llama stack build --template tgi
-...
-You can now edit ~/.llama/distributions/llamastack-tgi/tgi-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml`
-```
-:::
-
 :::{tab-item} Building from a pre-existing build config file
 - In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.

@ -348,35 +152,39 @@ llama stack build --config llama_stack/templates/ollama/build.yaml
 ```
 :::

-:::{tab-item} Building Docker
+:::{tab-item} Building Container
 > [!TIP]
-> Podman is supported as an alternative to Docker. Set `DOCKER_BINARY` to `podman` in your environment to use Podman.
+> Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.

-To build a docker image, you may start off from a template and use the `--image-type docker` flag to specify `docker` as the build image type.
+To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.

 ```
-llama stack build --template ollama --image-type docker
+llama stack build --template ollama --image-type container
 ```

 ```
-$ llama stack build --template ollama --image-type docker
+$ llama stack build --template ollama --image-type container
 ...
-Dockerfile created successfully in /tmp/tmp.viA3a3Rdsg/DockerfileFROM python:3.10-slim
+Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
 ...

 You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
 ```

-After this step is successful, you should be able to find the built docker image and test it with `llama stack run <path/to/run.yaml>`.
+After this step is successful, you should be able to find the built container image and test it with `llama stack run <path/to/run.yaml>`.
 :::

 ::::


-## Running your Stack server
+### Running your Stack server
 Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.

 ```
+# Start using template name
+llama stack run tgi
+
+# Start using config file
 llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
 ```

@ -402,14 +210,14 @@ Serving API agents
 POST /agents/step/get
 POST /agents/turn/get

-Listening on ['::', '0.0.0.0']:5000
+Listening on ['::', '0.0.0.0']:8321
 INFO:     Started server process [2935911]
 INFO:     Waiting for application startup.
 INFO:     Application startup complete.
-INFO:     Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
+INFO:     Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
 INFO:     2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
 ```

 ### Troubleshooting

-If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue.
+If you encounter any issues, ask questions in our Discord, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue.
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@ -70,20 +70,27 @@ Next up is the most critical part: the set of providers that the stack will use
 ```yaml
 providers:
  inference:
+  # provider_id is a string you can choose freely
  - provider_id: ollama
+    # provider_type is a string that specifies the type of provider.
+    # in this case, the provider for inference is ollama and it is run remotely (outside of the distribution)
    provider_type: remote::ollama
+    # config is a dictionary that contains the configuration for the provider.
+    # in this case, the configuration is the url of the ollama server
    config:
      url: ${env.OLLAMA_URL:http://localhost:11434}
 ```
 A few things to note:
- A _provider instance_ is identified with an (identifier, type, configuration) tuple. The identifier is a string you can choose freely.
+- A _provider instance_ is identified with an (id, type, configuration) triplet.
+- The id is a string you can choose freely.
 - You can instantiate any number of provider instances of the same type.
- The configuration dictionary is provider-specific. Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.
+- The configuration dictionary is provider-specific.
+- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.

 ## Resources
-```

 Finally, let's look at the `models` section:
+
 ```yaml
 models:
 - metadata: {}
--- a/docs/source/distributions/importing_as_library.md
+++ b/docs/source/distributions/importing_as_library.md
@ -1,11 +1,20 @@
 # Using Llama Stack as a Library

-If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server. For [example](https://github.com/meta-llama/llama-stack-client-python/blob/main/src/llama_stack_client/lib/direct/test.py):
+If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
+```bash
+# setup
+pip install llama-stack
+llama stack build --template together --image-type venv
+```

 ```python
-from llama_stack_client.lib.direct.direct import LlamaStackDirectClient
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

-client = await LlamaStackDirectClient.from_template('ollama')
+client = LlamaStackAsLibraryClient(
+    "ollama",
+    # provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
+    provider_data = {"tavily_search_api_key": os.environ['TAVILY_SEARCH_API_KEY']}
+)
 await client.initialize()
 ```

@ -14,23 +23,12 @@ This will parse your config and set up any inline implementations and remote cli
 Then, you can access the APIs like `models` and `inference` on the client and call their methods directly:

 ```python
-response = await client.models.list()
-print(response)
-```
-
-```python
-response = await client.inference.chat_completion(
-    messages=[UserMessage(content="What is the capital of France?", role="user")],
-    model_id="Llama3.1-8B-Instruct",
-    stream=False,
-)
-print("\nChat completion response:")
-print(response)
+response = client.models.list()
 ```

 If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html), you can also use the run.yaml configuration file directly:

 ```python
-client = await LlamaStackDirectClient.from_config(config_path)
-await client.initialize()
+client = LlamaStackAsLibraryClient(config_path)
+client.initialize()
 ```
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@ -1,40 +1,27 @@
-# Starting a Llama Stack
+# Starting a Llama Stack Server
+
+You can run a Llama Stack server in one of the following ways:
+
+**As a Library**:
+
+This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and are relying on an external inference service (e.g., Fireworks, Together, Groq, etc.). See [Using Llama Stack as a Library](importing_as_library).
+
+
+**Docker**:
+
+Another simple way to start interacting with Llama Stack is to spin up a Docker container that is pre-built with all the providers you need. We provide a number of pre-built Docker containers so you can start a Llama Stack server instantly. You can also build your own custom Docker container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
+
+
+**Conda**:
+
+Lastly, if you have a custom or advanced setup, or you are developing on Llama Stack, you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run`, you can build and run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
+
+
 ```{toctree}
-:maxdepth: 3
+:maxdepth: 1
 :hidden:

 importing_as_library
 building_distro
 configuration
 ```
-
-You can instantiate a Llama Stack in one of the following ways:
- **As a Library**: this is the simplest, especially if you are using an external inference service. See [Using Llama Stack as a Library](importing_as_library)
- **Docker**: we provide a number of pre-built Docker containers so you can start a Llama Stack server instantly. You can also build your own custom Docker container.
- **Conda**: finally, you can build a custom Llama Stack server using `llama stack build` containing the exact combination of providers you wish. We have provided various templates to make getting started easier.
-
-Which templates / distributions to choose depends on the hardware you have for running LLM inference.
-
- **Do you have access to a machine with powerful GPUs?**
-If so, we suggest:
-  - {dockerhub}`distribution-remote-vllm` ([Guide](self_hosted_distro/remote-vllm))
-  - {dockerhub}`distribution-meta-reference-gpu` ([Guide](self_hosted_distro/meta-reference-gpu))
-  - {dockerhub}`distribution-tgi` ([Guide](self_hosted_distro/tgi))
-
- **Are you running on a "regular" desktop machine?**
-If so, we suggest:
-  - {dockerhub}`distribution-ollama` ([Guide](self_hosted_distro/ollama))
-
- **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?** If so, we suggest:
-  - {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together))
-  - {dockerhub}`distribution-fireworks` ([Guide](self_hosted_distro/fireworks))
-
- **Do you want to run Llama Stack inference on your iOS / Android device** If so, we suggest:
-  - [iOS SDK](ondevice_distro/ios_sdk)
-  - [Android](ondevice_distro/android_sdk)
-
- **Do you want a hosted Llama Stack endpoint?** If so, we suggest:
-  - [Remote-Hosted Llama Stack Endpoints](remote_hosted_distro/index)
-
-
-You can also build your own [custom distribution](building_distro).
--- a/docs/source/distributions/ondevice_distro/ios_sdk.md
+++ b/docs/source/distributions/ondevice_distro/ios_sdk.md
@ -1,6 +1,3 @@
---
-orphan: true
---
 # iOS SDK

 We offer both remote and on-device use of Llama Stack in Swift via two components:
@ -27,7 +24,7 @@ If you don't want to run inference on-device, then you can connect to any hosted
 ```swift
 import LlamaStackClient

-let agents = RemoteAgents(url: URL(string: "http://localhost:5000")!)
+let agents = RemoteAgents(url: URL(string: "http://localhost:8321")!)
 let request = Components.Schemas.CreateAgentTurnRequest(
        agent_id: agentId,
        messages: [
--- a/docs/source/distributions/remote_hosted_distro/index.md
+++ b/docs/source/distributions/remote_hosted_distro/index.md
@ -1,6 +1,3 @@
---
-orphan: true
---
 # Remote-Hosted Distributions

 Remote-Hosted distributions are available endpoints serving Llama Stack API that you can directly connect to.
--- a/docs/source/distributions/remote_hosted_distro/nvidia.md
+++ b/docs/source/distributions/remote_hosted_distro/nvidia.md
@ -0,0 +1,73 @@
+# NVIDIA Distribution
+
+The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
+| inference | `remote::nvidia` |
+| safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
+| telemetry | `inline::meta-reference` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss` |
+
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+
+### Models
+
+The following models are available by default:
+
+- `meta-llama/Llama-3-8B-Instruct (meta/llama3-8b-instruct)`
+- `meta-llama/Llama-3-70B-Instruct (meta/llama3-70b-instruct)`
+- `meta-llama/Llama-3.1-8B-Instruct (meta/llama-3.1-8b-instruct)`
+- `meta-llama/Llama-3.1-70B-Instruct (meta/llama-3.1-70b-instruct)`
+- `meta-llama/Llama-3.1-405B-Instruct-FP8 (meta/llama-3.1-405b-instruct)`
+- `meta-llama/Llama-3.2-1B-Instruct (meta/llama-3.2-1b-instruct)`
+- `meta-llama/Llama-3.2-3B-Instruct (meta/llama-3.2-3b-instruct)`
+- `meta-llama/Llama-3.2-11B-Vision-Instruct (meta/llama-3.2-11b-vision-instruct)`
+- `meta-llama/Llama-3.2-90B-Vision-Instruct (meta/llama-3.2-90b-vision-instruct)`
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to an NVIDIA API key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+
+
+## Running Llama Stack with NVIDIA
+
+You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-nvidia \
+  --yaml-config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
+
+### Via Conda
+
+```bash
+llama stack build --template nvidia --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
--- a/docs/source/distributions/selection.md
+++ b/docs/source/distributions/selection.md
@ -0,0 +1,56 @@
+# List of Distributions
+
+Here is a list of distributions, provided out of the box, that you can use to start a Llama Stack server.
+
+## Selection of a Distribution / Template
+
+Which templates / distributions to choose depends on the hardware you have for running LLM inference.
+
+- **Do you want a hosted Llama Stack endpoint?** If so, we suggest leveraging our partners who host Llama Stack endpoints, namely _fireworks.ai_ and _together.xyz_.
+  - Read more about it here - [Remote-Hosted Endpoints](remote_hosted_distro/index).
+
+
+- **Do you have access to machines with GPUs?** If you wish to run Llama Stack locally or on a cloud instance and host your own Llama Stack endpoint, we suggest:
+  - {dockerhub}`distribution-remote-vllm` ([Guide](self_hosted_distro/remote-vllm))
+  - {dockerhub}`distribution-meta-reference-gpu` ([Guide](self_hosted_distro/meta-reference-gpu))
+  - {dockerhub}`distribution-tgi` ([Guide](self_hosted_distro/tgi))
+  - {dockerhub}`distribution-nvidia` ([Guide](self_hosted_distro/nvidia))
+
+- **Are you running on a "regular" desktop or laptop?** We suggest using the ollama template for quick prototyping, so you can get started without having to worry about needing GPUs.
+  - {dockerhub}`distribution-ollama` ([Guide](self_hosted_distro/ollama))
+
+- **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?**  If so, we suggest:
+  - {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together))
+  - {dockerhub}`distribution-fireworks` ([Guide](self_hosted_distro/fireworks))
+
+- **Do you want to run Llama Stack inference on your iOS / Android device?** Lastly, we also provide templates for running Llama Stack inference on your iOS / Android device:
+  - [iOS SDK](ondevice_distro/ios_sdk)
+  - [Android](ondevice_distro/android_sdk)
+
+
+- **If none of the above fit your needs, you can also build your own [custom distribution](building_distro).**
+
+### Distribution Details
+
+```{toctree}
+:maxdepth: 1
+
+remote_hosted_distro/index
+self_hosted_distro/remote-vllm
+self_hosted_distro/meta-reference-gpu
+self_hosted_distro/tgi
+self_hosted_distro/nvidia
+self_hosted_distro/ollama
+self_hosted_distro/together
+self_hosted_distro/fireworks
+ondevice_distro/index
+```
+
+### On-Device Distributions
+
+```{toctree}
+:maxdepth: 1
+
+ondevice_distro/ios_sdk
+ondevice_distro/android_sdk
+```
--- a/docs/source/distributions/self_hosted_distro/bedrock.md
+++ b/docs/source/distributions/self_hosted_distro/bedrock.md
@ -15,11 +15,11 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::bedrock` |
-| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `remote::bedrock` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |



@ -27,7 +27,7 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)

 ### Models

--- a/docs/source/distributions/self_hosted_distro/cerebras.md
+++ b/docs/source/distributions/self_hosted_distro/cerebras.md
@ -5,18 +5,21 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
 | inference | `remote::cerebras` |
-| memory | `inline::meta-reference` |
 | safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 ### Environment Variables

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `CEREBRAS_API_KEY`: Cerebras API Key (default: ``)

 ### Models
--- a/docs/source/distributions/self_hosted_distro/dell-tgi.md
+++ b/docs/source/distributions/self_hosted_distro/dell-tgi.md
@ -41,7 +41,7 @@ The script will first start up TGI server, then start up Llama Stack distributio
 INFO:     Started server process [1]
 INFO:     Waiting for application startup.
 INFO:     Application startup complete.
-INFO:     Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
+INFO:     Uvicorn running on http://[::]:8321 (Press CTRL+C to quit)
 ```

 To kill the server
@ -65,7 +65,7 @@ registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1
 #### Start Llama Stack server pointing to TGI server

 ```
-docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-tgi --yaml_config /root/my-run.yaml
+docker run --network host -it -p 8321:8321 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-tgi --yaml_config /root/my-run.yaml
 ```

 Make sure in you `run.yaml` file, you inference provider is pointing to the correct TGI server endpoint. E.g.
--- a/docs/source/distributions/self_hosted_distro/fireworks.md
+++ b/docs/source/distributions/self_hosted_distro/fireworks.md
@ -18,34 +18,34 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::fireworks` |
-| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 ### Environment Variables

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `FIREWORKS_API_KEY`: Fireworks.AI API Key (default: ``)

 ### Models

 The following models are available by default:

- `meta-llama/Llama-3.1-8B-Instruct (fireworks/llama-v3p1-8b-instruct)`
- `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)`
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)`
- `meta-llama/Llama-3.2-1B-Instruct (fireworks/llama-v3p2-1b-instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)`
- `meta-llama/Llama-3.3-70B-Instruct (fireworks/llama-v3p3-70b-instruct)`
- `meta-llama/Llama-Guard-3-8B (fireworks/llama-guard-3-8b)`
- `meta-llama/Llama-Guard-3-11B-Vision (fireworks/llama-guard-3-11b-vision)`
+- `meta-llama/Llama-3.1-8B-Instruct (accounts/fireworks/models/llama-v3p1-8b-instruct)`
+- `meta-llama/Llama-3.1-70B-Instruct (accounts/fireworks/models/llama-v3p1-70b-instruct)`
+- `meta-llama/Llama-3.1-405B-Instruct-FP8 (accounts/fireworks/models/llama-v3p1-405b-instruct)`
+- `meta-llama/Llama-3.2-1B-Instruct (accounts/fireworks/models/llama-v3p2-1b-instruct)`
+- `meta-llama/Llama-3.2-3B-Instruct (accounts/fireworks/models/llama-v3p2-3b-instruct)`
+- `meta-llama/Llama-3.2-11B-Vision-Instruct (accounts/fireworks/models/llama-v3p2-11b-vision-instruct)`
+- `meta-llama/Llama-3.2-90B-Vision-Instruct (accounts/fireworks/models/llama-v3p2-90b-vision-instruct)`
+- `meta-llama/Llama-3.3-70B-Instruct (accounts/fireworks/models/llama-v3p3-70b-instruct)`
+- `meta-llama/Llama-Guard-3-8B (accounts/fireworks/models/llama-guard-3-8b)`
+- `meta-llama/Llama-Guard-3-11B-Vision (accounts/fireworks/models/llama-guard-3-11b-vision)`


 ### Prerequisite: API Keys
--- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
@ -18,11 +18,11 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `inline::meta-reference` |
-| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
@ -31,7 +31,7 @@ Note that you need access to nvidia GPUs to run this distribution. This distribu

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
 - `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
--- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
@ -18,11 +18,11 @@ The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `inline::meta-reference-quantized` |
-| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
@ -33,7 +33,7 @@ Note that you need access to nvidia GPUs to run this distribution. This distribu

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)

--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@ -0,0 +1,60 @@
+# NVIDIA Distribution
+
+The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| inference | `remote::nvidia` |
+| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
+| safety | `inline::llama-guard` |
+| telemetry | `inline::meta-reference` |
+
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+
+### Models
+
+The following models are available by default:
+
+- `${env.INFERENCE_MODEL} (None)`
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to an NVIDIA API key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+
+
+## Running Llama Stack with NVIDIA
+
+You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-nvidia \
+  --yaml-config /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
+
+### Via Conda
+
+```bash
+llama stack build --template nvidia --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+```
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@ -18,18 +18,18 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::ollama` |
-| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `OLLAMA_URL`: URL of the Ollama server (default: `http://127.0.0.1:11434`)
 - `INFERENCE_MODEL`: Inference model loaded into the Ollama server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `SAFETY_MODEL`: Safety model loaded into the Ollama server (default: `meta-llama/Llama-Guard-3-1B`)
@ -82,11 +82,15 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
 docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
-  -v ./run-with-safety.yaml:/root/my-run.yaml \
+  -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
  llamastack/distribution-ollama \
  --yaml-config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@ -14,11 +14,14 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
 | inference | `remote::vllm` |
-| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
@ -27,9 +30,9 @@ You can use this distribution if you have GPUs and want to run an independent vL

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100}/v1`)
+- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100/v1`)
 - `MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`)
 - `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`)
 - `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
@ -107,10 +110,15 @@ If you are using Llama Stack Safety / Shield APIs, use:
 export SAFETY_PORT=8081
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
 docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run-with-safety.yaml:/root/my-run.yaml \
+  -v ~/.llama:/root/.llama \
+  -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
  llamastack/distribution-remote-vllm \
  --yaml-config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
--- a/docs/source/distributions/self_hosted_distro/sambanova.md
+++ b/docs/source/distributions/self_hosted_distro/sambanova.md
@ -0,0 +1,75 @@
+---
+orphan: true
+---
+# SambaNova Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-sambanova` distribution consists of the following provider configurations.
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| inference | `remote::sambanova` |
+| safety | `inline::llama-guard` |
+| telemetry | `inline::meta-reference` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
+
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `SAMBANOVA_API_KEY`: SambaNova.AI API Key (default: ``)
+
+### Models
+
+The following models are available by default:
+
+- `meta-llama/Llama-3.1-8B-Instruct (Meta-Llama-3.1-8B-Instruct)`
+- `meta-llama/Llama-3.1-70B-Instruct (Meta-Llama-3.1-70B-Instruct)`
+- `meta-llama/Llama-3.1-405B-Instruct-FP8 (Meta-Llama-3.1-405B-Instruct)`
+- `meta-llama/Llama-3.2-1B-Instruct (Meta-Llama-3.2-1B-Instruct)`
+- `meta-llama/Llama-3.2-3B-Instruct (Meta-Llama-3.2-3B-Instruct)`
+- `meta-llama/Llama-3.2-11B-Vision-Instruct (Llama-3.2-11B-Vision-Instruct)`
+- `meta-llama/Llama-3.2-90B-Vision-Instruct (Llama-3.2-90B-Vision-Instruct)`
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://sambanova.ai/).
+
+
+## Running Llama Stack with SambaNova
+
+You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  llamastack/distribution-sambanova \
+  --port $LLAMA_STACK_PORT \
+  --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
+```
+
+### Via Conda
+
+```bash
+llama stack build --template sambanova --image-type conda
+llama stack run ./run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
+```
--- a/docs/source/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/distributions/self_hosted_distro/tgi.md
@ -19,11 +19,11 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::tgi` |
-| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference.
@ -32,7 +32,7 @@ You can use this distribution if you have GPUs and want to run an independent TG

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080}/v1`)
 - `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`)
@ -102,10 +102,15 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
 docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run-with-safety.yaml:/root/my-run.yaml \
+  -v ~/.llama:/root/.llama \
+  -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
  llamastack/distribution-tgi \
  --yaml-config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
--- a/docs/source/distributions/self_hosted_distro/together.md
+++ b/docs/source/distributions/self_hosted_distro/together.md
@ -18,18 +18,18 @@ The `llamastack/distribution-together` distribution consists of the following pr
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::together` |
-| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 ### Environment Variables

 The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
 - `TOGETHER_API_KEY`: Together.AI API Key (default: ``)

 ### Models
--- a/docs/source/getting_started/index.md
+++ b/docs/source/getting_started/index.md
@ -1,33 +1,49 @@
 # Quick Start

-In this guide, we'll through how you can use the Llama Stack client SDK to build a simple RAG agent.
+In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.

-The most critical requirement for running the agent is running inference on the underlying Llama model. Depending on what hardware (GPUs) you have available, you have various options. We will use `Ollama` for this purpose as it is the easiest to get started with and yet robust.
+A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.

-First, let's set up some environment variables that we will use in the rest of the guide. Note that if you open up a new terminal, you will need to set these again.
+In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.

-```bash
-export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
-# ollama names this model differently, and we must use the ollama name when loading the model
-export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
-export LLAMA_STACK_PORT=5001
-```

 ### 1. Start Ollama

 ```bash
-ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
+ollama run llama3.2:3b-instruct-fp16 --keepalive 60m
 ```

 By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for sometime.

+NOTE: If you do not have ollama, you can install it from [here](https://ollama.ai/docs/installation).

-### 2. Start the Llama Stack server

-Llama Stack is based on a client-server architecture. It consists of a server which can be configured very flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Memory, Agents, Telemetry, Evals and so forth.
+
+### 2. Pick a client environment
+
+Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through a REST interface. You can interact with the Stack in two ways:
+
+* Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server.
+* Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`.
+
+```{admonition} Note
+:class: tip
+
+The API is **exactly identical** for both clients.
+```
+
+:::{dropdown} Starting up the Llama Stack server
+The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc.

 To get started quickly, we provide various Docker images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the Docker image.

+Let's set up some environment variables that we will use in the rest of the guide.
+```bash
+export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
+export LLAMA_STACK_PORT=8321
+```
+
+You can start the server using the following command:
 ```bash
 docker run -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@ -37,14 +53,18 @@ docker run -it \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://host.docker.internal:11434
 ```
-
 Configuration for this is available at `distributions/ollama/run.yaml`.

+:::

-### 3. Use the Llama Stack client SDK
+
+:::{dropdown} Installing the Llama Stack client CLI and SDK

 You can interact with the Llama Stack server using various client SDKs. We will use the Python SDK which you can install using the following command. Note that you must be using Python 3.10 or newer:
 ```bash
+yes | conda create -n stack-client python=3.10
+conda activate stack-client
+
 pip install llama-stack-client
 ```

@ -66,17 +86,35 @@ llama-stack-client \
  inference chat-completion \
  --message "hello, what model are you?"
 ```
+:::

-Here is a simple example to perform chat completions using Python instead of the CLI.
+&nbsp;
+
+### 3. Run inference with Python SDK
+
+Here is a simple example to perform chat completions using the SDK.
 ```python
 import os
-from llama_stack_client import LlamaStackClient

-client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
+def create_http_client():
+    from llama_stack_client import LlamaStackClient
+    return LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
+
+def create_library_client(template="ollama"):
+    from llama_stack import LlamaStackAsLibraryClient
+    client = LlamaStackAsLibraryClient(template)
+    client.initialize()
+    return client
+
+
+client = create_library_client()  # or create_http_client() depending on the environment you picked

 # List available models
 models = client.models.list()
-print(models)
+print("--- Available models: ---")
+for m in models:
+    print(f"- {m.identifier}")
+print()

 response = client.inference.chat_completion(
    model_id=os.environ["INFERENCE_MODEL"],
@ -90,62 +128,78 @@ print(response.completion_message.content)

 ### 4. Your first RAG agent

-Here is an example of a simple RAG agent that uses the Llama Stack client SDK.
+Here is an example of a simple RAG (Retrieval Augmented Generation) chatbot agent which can answer questions about TorchTune documentation.

 ```python
-import asyncio
 import os
+from termcolor import cprint

-from llama_stack_client import LlamaStackClient
 from llama_stack_client.lib.agents.agent import Agent
 from llama_stack_client.lib.agents.event_logger import EventLogger
-from llama_stack_client.types import Attachment
 from llama_stack_client.types.agent_create_params import AgentConfig
+from llama_stack_client.types import Document

+client = create_library_client()  # or create_http_client() depending on the environment you picked

-async def run_main():
-    urls = ["chat.rst", "llama3.rst", "datasets.rst", "lora_finetune.rst"]
-    attachments = [
-        Attachment(
-            content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
-            mime_type="text/plain",
-        )
-        for i, url in enumerate(urls)
-    ]
-
-    client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
-
-    agent_config = AgentConfig(
-        model=os.environ["INFERENCE_MODEL"],
-        instructions="You are a helpful assistant",
-        tools=[{"type": "memory"}],  # enable Memory aka RAG
-        enable_session_persistence=True,
+# Documents to be used for RAG
+urls = ["chat.rst", "llama3.rst", "datasets.rst", "lora_finetune.rst"]
+documents = [
+    Document(
+        document_id=f"num-{i}",
+        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
+        mime_type="text/plain",
+        metadata={},
    )
+    for i, url in enumerate(urls)
+]

-    agent = Agent(client, agent_config)
-    session_id = agent.create_session("test-session")
-    user_prompts = [
-        (
-            "I am attaching documentation for Torchtune. Help me answer questions I will ask next.",
-            attachments,
-        ),
-        (
-            "What are the top 5 topics that were explained? Only list succinct bullet points.",
-            None,
-        ),
-    ]
-    for prompt, attachments in user_prompts:
-        response = agent.create_turn(
-            messages=[{"role": "user", "content": prompt}],
-            attachments=attachments,
-            session_id=session_id,
-        )
-        for log in EventLogger().log(response):
-            log.print()
+# Register a vector database
+vector_db_id = "test-vector-db"
+client.vector_dbs.register(
+    vector_db_id=vector_db_id,
+    embedding_model="all-MiniLM-L6-v2",
+    embedding_dimension=384,
+)

+# Insert the documents into the vector database
+client.tool_runtime.rag_tool.insert(
+    documents=documents,
+    vector_db_id=vector_db_id,
+    chunk_size_in_tokens=512,
+)

-if __name__ == "__main__":
-    asyncio.run(run_main())
+agent_config = AgentConfig(
+    model=os.environ["INFERENCE_MODEL"],
+    # Define instructions for the agent (aka system prompt)
+    instructions="You are a helpful assistant",
+    enable_session_persistence=False,
+    # Define tools available to the agent
+    toolgroups = [
+        {
+          "name": "builtin::rag",
+          "args" : {
+            "vector_db_ids": [vector_db_id],
+          }
+        }
+    ],
+)
+
+rag_agent = Agent(client, agent_config)
+session_id = rag_agent.create_session("test-session")
+
+user_prompts = [
+    "What are the top 5 topics that were explained? Only list succinct bullet points.",
+]
+
+# Run the agent loop by calling the `create_turn` method
+for prompt in user_prompts:
+    cprint(f'User> {prompt}', 'green')
+    response = rag_agent.create_turn(
+        messages=[{"role": "user", "content": prompt}],
+        session_id=session_id,
+    )
+    for log in EventLogger().log(response):
+        log.print()
 ```

 ## Next Steps
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -1,23 +1,34 @@
+```{admonition} News
+:class: tip
+
+Llama Stack 0.1.0 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.0) for more details.
+```
+
 # Llama Stack

-Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.
+
+Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides
+
+- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
+- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
+- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment
+- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android
+- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack
+
+We focus on making it easy to build production applications with the Llama model family - from the latest Llama 3.3 to specialized models like Llama Guard for safety.

 ```{image} ../_static/llama-stack.png
 :alt: Llama Stack
 :width: 400px
 ```

-Our goal is to provide pre-packaged implementations which can be operated in a variety of deployment environments: developers start iterating with Desktops or their mobile devices and can seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.
-
-```{note}
-The Stack APIs are rapidly improving but still a work-in-progress. We invite feedback as well as direct contributions.
-```
+Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. Llama Stack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.

 ## Quick Links

 - New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
 - Ready to build? Check out the [Quick Start](getting_started/index) to get started.
- Need specific providers? Browse [Distributions](distributions/index) to see all the options available.
+- Need specific providers? Browse [Distributions](distributions/selection) to see all the options available.
 - Want to contribute? See the [Contributing](contributing/index) guide.

 ## Available SDKs
@ -33,33 +44,52 @@ We have a number of client-side SDKs available for different languages.

 ## Supported Llama Stack Implementations

-A number of "adapters" are available for some popular Inference and Memory (Vector Store) providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
+A number of "adapters" are available for some popular Inference and Vector Store providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
+
+**Inference API**
+|  **Provider** |  **Environments** |
+| :----: | :----: |
+|  Meta Reference  |  Single Node |
+|  Ollama  | Single Node   |
+|  Fireworks  |  Hosted  |
+|  Together  |  Hosted  |
+|  NVIDIA NIM  |  Hosted and Single Node  |
+|  vLLM  | Hosted and Single Node |
+|  TGI  |  Hosted and Single Node  |
+|  AWS Bedrock  |  Hosted  |
+|  Cerebras  |  Hosted  |
+|  Groq  |  Hosted  |
+|  SambaNova  |  Hosted  |
+| PyTorch ExecuTorch | On-device iOS, Android |
+
+**Vector IO API**
+|  **Provider** |  **Environments** |
+| :----: | :----: |
+|  FAISS | Single Node |
+|  Chroma | Hosted and Single Node |
+|  Postgres (PGVector) | Hosted and Single Node |
+|  Weaviate | Hosted |
+
+**Safety API**
+|  **Provider** |  **Environments** |
+| :----: | :----: |
+|  Llama Guard | Depends on Inference Provider |
+|  Prompt Guard | Single Node |
+|  Code Scanner | Single Node |
+|  AWS Bedrock | Hosted |

-|  **API Provider** |  **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
-| :----: | :----: | :----: | :----: | :----: | :----: | :----: |
-|  Meta Reference  |  Single Node | Y  |  Y  |  Y  |  Y  |  Y  |
-|  Cerebras  |  Single Node  |   | Y  |    |    |   |
-|  Fireworks  |  Hosted  | Y  | Y  |  Y  |    |   |
-|  AWS Bedrock  |  Hosted  |    |  Y  |    | Y  | |
-|  Together  |  Hosted  |  Y  |  Y  |   | Y  |  |
-|  Ollama  | Single Node   |    |  Y  |    |   |
-|  TGI  |  Hosted and Single Node  |    |  Y  |    |   |
-|  [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama)  |  Hosted and Single Node  |    |  Y  |    |   |
-| Chroma | Single Node |  |  | Y |  |  |
-| Postgres | Single Node |  |  | Y |  |  |
-| PyTorch ExecuTorch | On-device iOS | Y  | Y  |  |  |
-| PyTorch ExecuTorch | On-device Android |  | Y  |  |  |

 ```{toctree}
 :hidden:
 :maxdepth: 3

+self
 introduction/index
 getting_started/index
 concepts/index
 distributions/index
+distributions/selection
 building_applications/index
-benchmark_evaluations/index
 playground/index
 contributing/index
 references/index
--- a/docs/source/introduction/index.md
+++ b/docs/source/introduction/index.md
@ -19,77 +19,45 @@ Building production AI applications today requires solving multiple challenges:
 - Changing providers requires significant code changes.


-### The Vision: A Universal Stack
-
+### Our Solution: A Universal Stack

 ```{image} ../../_static/llama-stack.png
 :alt: Llama Stack
 :width: 400px
 ```

-Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. These building blocks are presented as interoperable APIs with a broad set of Service Providers providing their implementations.
+Llama Stack addresses these challenges through a service-oriented, API-first approach:

-#### Service-oriented Design
-Unlike other frameworks, Llama Stack is built with a service-oriented, REST API-first approach. Such a design not only allows for seamless transitions from local to remote deployments but also forces the design to be more declarative. This restriction can result in a much simpler, robust developer experience. The same code works across different environments:
+**Develop Anywhere, Deploy Everywhere**
+- Start locally with CPU-only setups
+- Move to GPU acceleration when needed
+- Deploy to cloud or edge without code changes
+- Same APIs and developer experience everywhere

- Local development with CPU-only setups
- Self-hosted with GPU acceleration
- Cloud-hosted on providers like AWS, Fireworks, Together
- On-device for iOS and Android
-
-
-#### Composability
-The APIs we design are composable. An Agent abstractly depends on { Inference, Memory, Safety } APIs but does not care about the actual implementation details. Safety itself may require model inference and hence can depend on the Inference API.
-
-#### Turnkey Solutions
-
-We provide turnkey solutions for popular deployment scenarios. It should be easy to deploy a Llama Stack server on AWS or in a private data center. Either of these should allow a developer to get started with powerful agentic apps, model evaluations, or fine-tuning services in minutes.
-
-We have built-in support for critical needs:
-
- Safety guardrails and content filtering
- Comprehensive evaluation capabilities
+**Production-Ready Building Blocks**
+- Pre-built safety guardrails and content filtering
+- Built-in RAG and agent capabilities
+- Comprehensive evaluation toolkit
 - Full observability and monitoring
- Provider federation and fallback

-#### Focus on Llama Models
-As a Meta-initiated project, we explicitly focus on Meta's Llama series of models. Supporting the broad set of open models is no easy task and we want to start with models we understand best.
+**True Provider Independence**
+- Swap providers without application changes
+- Mix and match best-in-class implementations
+- Federation and fallback support
+- No vendor lock-in

-#### Supporting the Ecosystem
-There is a vibrant ecosystem of Providers which provide efficient inference or scalable vector stores or powerful observability solutions. We want to make sure it is easy for developers to pick and choose the best implementations for their use cases. We also want to make sure it is easy for new Providers to onboard and participate in the ecosystem.
-
-Additionally, we have designed every element of the Stack such that APIs as well as Resources (like Models) can be federated.
-
-#### Rich Provider Ecosystem
-
-```{list-table}
-:header-rows: 1
-
-* - Provider
-  - Local
-  - Self-hosted
-  - Cloud
-* - Inference
-  - Ollama
-  - vLLM, TGI
-  - Fireworks, Together, AWS
-* - Memory
-  - FAISS
-  - Chroma, pgvector
-  - Weaviate
-* - Safety
-  - Llama Guard
-  - -
-  - AWS Bedrock
-```
+**Robust Ecosystem**
+- Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies).
+- The ecosystem offers tailored infrastructure, software, and services for deploying Llama models.


-### Unified API Layer
+### Our Philosophy

-Llama Stack provides a consistent interface for:
+- **Service-Oriented**: REST APIs enforce clean interfaces and enable seamless transitions across different environments.
+- **Composability**: Every component is independent but works together seamlessly
+- **Production Ready**: Built for real-world applications, not just demos
+- **Turnkey Solutions**: Easy-to-deploy, built-in solutions for popular deployment scenarios
+- **Llama First**: Explicit focus on Meta's Llama models and partnering ecosystem

- **Inference**: Run LLM models efficiently
- **Safety**: Apply content filtering and safety policies
- **Memory**: Store and retrieve knowledge for RAG
- **Agents**: Build multi-step workflows
- **Evaluation**: Test and improve application quality
+
+With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
--- a/docs/source/references/evals_reference/index.md
+++ b/docs/source/references/evals_reference/index.md
@ -92,9 +92,10 @@ response = client.eval.evaluate_rows(
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            "sampling_params": {
-                "temperature": 0.0,
+                "strategy": {
+                    "type": "greedy",
+                },
                "max_tokens": 4096,
-                "top_p": 0.9,
                "repeat_penalty": 1.0,
            },
            "system_message": system_message
@ -149,9 +150,10 @@ response = client.eval.evaluate_rows(
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            "sampling_params": {
-                "temperature": 0.0,
+                "strategy": {
+                    "type": "greedy",
+                },
                "max_tokens": 4096,
-                "top_p": 0.9,
                "repeat_penalty": 1.0,
            },
        }
@ -170,9 +172,9 @@ agent_config = {
    "model": "meta-llama/Llama-3.1-405B-Instruct",
    "instructions": "You are a helpful assistant",
    "sampling_params": {
-        "strategy": "greedy",
-        "temperature": 0.0,
-        "top_p": 0.95,
+        "strategy": {
+            "type": "greedy",
+        },
    },
    "tools": [
        {
@ -318,10 +320,9 @@ The `EvalTaskConfig` are user specified config to define:
        "type": "model",
        "model": "Llama3.2-3B-Instruct",
        "sampling_params": {
-            "strategy": "greedy",
-            "temperature": 0,
-            "top_p": 0.95,
-            "top_k": 0,
+            "strategy": {
+                "type": "greedy",
+            },
            "max_tokens": 0,
            "repetition_penalty": 1.0
        }
@ -337,10 +338,9 @@ The `EvalTaskConfig` are user specified config to define:
        "type": "model",
        "model": "Llama3.1-405B-Instruct",
        "sampling_params": {
-            "strategy": "greedy",
-            "temperature": 0,
-            "top_p": 0.95,
-            "top_k": 0,
+            "strategy": {
+                "type": "greedy",
+            },
            "max_tokens": 0,
            "repetition_penalty": 1.0
        }
--- a/docs/source/references/llama_cli_reference/download_models.md
+++ b/docs/source/references/llama_cli_reference/download_models.md
@ -97,20 +97,20 @@ To download models, you can use the llama download command.

 #### Downloading from [Meta](https://llama.meta.com/llama-downloads/)

-Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/)
+Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/). Note: You need to quote the META_URL so that your shell does not interpret any special characters it contains.

 Download the required checkpoints using the following commands:
 ```bash
 # download the 8B model, this can be run on a single GPU
-llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url META_URL
+llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url 'META_URL'

 # you can also get the 70B model, this will require 8 GPUs however
-llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url META_URL
+llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url 'META_URL'

 # llama-agents have safety enabled by default. For this, you will need
 # safety models -- Llama-Guard and Prompt-Guard
-llama download --source meta --model-id Prompt-Guard-86M --meta-url META_URL
-llama download --source meta --model-id Llama-Guard-3-1B --meta-url META_URL
+llama download --source meta --model-id Prompt-Guard-86M --meta-url 'META_URL'
+llama download --source meta --model-id Llama-Guard-3-1B --meta-url 'META_URL'
 ```

 #### Downloading from [Hugging Face](https://huggingface.co/meta-llama)
--- a/docs/source/references/llama_cli_reference/index.md
+++ b/docs/source/references/llama_cli_reference/index.md
@ -214,7 +214,6 @@ llama model describe -m Llama3.2-3B-Instruct
 |                             | }                                |
 +-----------------------------+----------------------------------+
 | Recommended sampling params | {                                |
-|                             |     "strategy": "top_p",         |
 |                             |     "temperature": 1.0,          |
 |                             |     "top_p": 0.9,                |
 |                             |     "top_k": 0                   |
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@ -23,8 +23,8 @@ subcommands:
 ```bash
 $ llama-stack-client configure
 > Enter the host name of the Llama Stack distribution server: localhost
-> Enter the port number of the Llama Stack distribution server: 5000
-Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
+> Enter the port number of the Llama Stack distribution server: 8321
+Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
 ```

 ### `llama-stack-client providers list`
@ -103,36 +103,35 @@ $ llama-stack-client models update <model_id> [--provider-id <provider_id>] [--p
 $ llama-stack-client models delete <model_id>
 ```

-## Memory Bank Management
+## Vector DB Management

-### `llama-stack-client memory_banks list`
+### `llama-stack-client vector_dbs list`
 ```bash
-$ llama-stack-client memory_banks list
+$ llama-stack-client vector_dbs list
 ```
 ```
-+--------------+----------------+--------+-------------------+------------------------+--------------------------+
-| identifier   | provider_id    | type   | embedding_model   |   chunk_size_in_tokens |   overlap_size_in_tokens |
-+==============+================+========+===================+========================+==========================+
-| test_bank    | meta-reference | vector | all-MiniLM-L6-v2  |                    512 |                       64 |
-+--------------+----------------+--------+-------------------+------------------------+--------------------------+
+--------------+----------------+---------------------+---------------+------------------------+
+| identifier   | provider_id    | provider_resource_id| vector_db_type| params                |
+==============+================+=====================+===============+========================+
+| test_bank    | meta-reference | test_bank          | vector        | embedding_model: all-MiniLM-L6-v2
+                                                                      embedding_dimension: 384|
+--------------+----------------+---------------------+---------------+------------------------+
 ```

-### `llama-stack-client memory_banks register`
+### `llama-stack-client vector_dbs register`
 ```bash
-$ llama-stack-client memory_banks register <memory-bank-id> --type <type> [--provider-id <provider-id>] [--provider-memory-bank-id <provider-memory-bank-id>] [--chunk-size <chunk-size>] [--embedding-model <embedding-model>] [--overlap-size <overlap-size>]
+$ llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
 ```

 Options:
- `--type`: Required. Type of memory bank. Choices: "vector", "keyvalue", "keyword", "graph"
- `--provider-id`: Optional. Provider ID for the memory bank
- `--provider-memory-bank-id`: Optional. Provider's memory bank ID
- `--chunk-size`: Optional. Chunk size in tokens (for vector type). Default: 512
- `--embedding-model`: Optional. Embedding model (for vector type). Default: "all-MiniLM-L6-v2"
- `--overlap-size`: Optional. Overlap size in tokens (for vector type). Default: 64
+- `--provider-id`: Optional. Provider ID for the vector db
+- `--provider-vector-db-id`: Optional. Provider's vector db ID
+- `--embedding-model`: Optional. Embedding model to use. Default: "all-MiniLM-L6-v2"
+- `--embedding-dimension`: Optional. Dimension of embeddings. Default: 384

-### `llama-stack-client memory_banks unregister`
+### `llama-stack-client vector_dbs unregister`
 ```bash
-$ llama-stack-client memory_banks unregister <memory-bank-id>
+$ llama-stack-client vector_dbs unregister <vector-db-id>
 ```

 ## Shield Management
@ -201,11 +200,6 @@ Example eval_task_config.json:
        "model": "Llama3.1-405B-Instruct",
        "sampling_params": {
            "strategy": "greedy",
-            "temperature": 0,
-            "top_p": 0.95,
-            "top_k": 0,
-            "max_tokens": 0,
-            "repetition_penalty": 1.0
        }
    }
 }
@ -221,3 +215,44 @@ Options:
 - `--output-dir`: Required. Path to the directory where scoring results will be saved
 - `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
 - `--visualize`: Optional flag. If set, visualizes scoring results after completion
+
+## Tool Group Management
+
+### `llama-stack-client toolgroups list`
+```bash
+$ llama-stack-client toolgroups list
+```
+```
+---------------------------+------------------+------+---------------+
+| identifier                | provider_id      | args | mcp_endpoint  |
+===========================+==================+======+===============+
+| builtin::code_interpreter | code-interpreter | None | None         |
+---------------------------+------------------+------+---------------+
+| builtin::rag             | rag-runtime      | None | None         |
+---------------------------+------------------+------+---------------+
+| builtin::websearch       | tavily-search    | None | None         |
+---------------------------+------------------+------+---------------+
+```
+
+### `llama-stack-client toolgroups get`
+```bash
+$ llama-stack-client toolgroups get <toolgroup_id>
+```
+
+Shows detailed information about a specific toolgroup. If the toolgroup is not found, displays an error message.
+
+### `llama-stack-client toolgroups register`
+```bash
+$ llama-stack-client toolgroups register <toolgroup_id> [--provider-id <provider-id>] [--provider-toolgroup-id <provider-toolgroup-id>] [--mcp-config <mcp-config>] [--args <args>]
+```
+
+Options:
+- `--provider-id`: Optional. Provider ID for the toolgroup
+- `--provider-toolgroup-id`: Optional. Provider's toolgroup ID
+- `--mcp-config`: Optional. JSON configuration for the MCP endpoint
+- `--args`: Optional. JSON arguments for the toolgroup
+
+### `llama-stack-client toolgroups unregister`
+```bash
+$ llama-stack-client toolgroups unregister <toolgroup_id>
+```
--- a/docs/source/references/python_sdk_reference/index.md
+++ b/docs/source/references/python_sdk_reference/index.md
@ -4,29 +4,77 @@

 ```python
 from llama_stack_client.types import (
-    Attachment,
+    AgentConfig,
    BatchCompletion,
    CompletionMessage,
+    ContentDelta,
+    Document,
+    InterleavedContent,
+    InterleavedContentItem,
+    Message,
+    ParamType,
+    QueryConfig,
+    QueryResult,
+    ReturnType,
+    SafetyViolation,
    SamplingParams,
+    ScoringResult,
    SystemMessage,
    ToolCall,
+    ToolParamDefinition,
    ToolResponseMessage,
+    URL,
    UserMessage,
 )
 ```

-## Telemetry
+## Toolgroups

 Types:

 ```python
-from llama_stack_client.types import TelemetryGetTraceResponse
+from llama_stack_client.types import ListToolGroupsResponse, ToolGroup, ToolgroupListResponse
 ```

 Methods:

- <code title="get /telemetry/get_trace">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">get_trace</a>(\*\*<a href="src/llama_stack_client/types/telemetry_get_trace_params.py">params</a>) -> <a href="./src/llama_stack_client/types/telemetry_get_trace_response.py">TelemetryGetTraceResponse</a></code>
- <code title="post /telemetry/log_event">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">log</a>(\*\*<a href="src/llama_stack_client/types/telemetry_log_params.py">params</a>) -> None</code>
+- <code title="get /v1/toolgroups">client.toolgroups.<a href="./src/llama_stack_client/resources/toolgroups.py">list</a>() -> <a href="./src/llama_stack_client/types/toolgroup_list_response.py">ToolgroupListResponse</a></code>
+- <code title="get /v1/toolgroups/{toolgroup_id}">client.toolgroups.<a href="./src/llama_stack_client/resources/toolgroups.py">get</a>(toolgroup_id) -> <a href="./src/llama_stack_client/types/tool_group.py">ToolGroup</a></code>
+- <code title="post /v1/toolgroups">client.toolgroups.<a href="./src/llama_stack_client/resources/toolgroups.py">register</a>(\*\*<a href="src/llama_stack_client/types/toolgroup_register_params.py">params</a>) -> None</code>
+- <code title="delete /v1/toolgroups/{toolgroup_id}">client.toolgroups.<a href="./src/llama_stack_client/resources/toolgroups.py">unregister</a>(toolgroup_id) -> None</code>
+
+## Tools
+
+Types:
+
+```python
+from llama_stack_client.types import ListToolsResponse, Tool, ToolListResponse
+```
+
+Methods:
+
+- <code title="get /v1/tools">client.tools.<a href="./src/llama_stack_client/resources/tools.py">list</a>(\*\*<a href="src/llama_stack_client/types/tool_list_params.py">params</a>) -> <a href="./src/llama_stack_client/types/tool_list_response.py">ToolListResponse</a></code>
+- <code title="get /v1/tools/{tool_name}">client.tools.<a href="./src/llama_stack_client/resources/tools.py">get</a>(tool_name) -> <a href="./src/llama_stack_client/types/tool.py">Tool</a></code>
+
+## ToolRuntime
+
+Types:
+
+```python
+from llama_stack_client.types import ToolDef, ToolInvocationResult
+```
+
+Methods:
+
+- <code title="post /v1/tool-runtime/invoke">client.tool_runtime.<a href="./src/llama_stack_client/resources/tool_runtime/tool_runtime.py">invoke_tool</a>(\*\*<a href="src/llama_stack_client/types/tool_runtime_invoke_tool_params.py">params</a>) -> <a href="./src/llama_stack_client/types/tool_invocation_result.py">ToolInvocationResult</a></code>
+- <code title="get /v1/tool-runtime/list-tools">client.tool_runtime.<a href="./src/llama_stack_client/resources/tool_runtime/tool_runtime.py">list_tools</a>(\*\*<a href="src/llama_stack_client/types/tool_runtime_list_tools_params.py">params</a>) -> <a href="./src/llama_stack_client/types/tool_def.py">JSONLDecoder[ToolDef]</a></code>
+
+### RagTool
+
+Methods:
+
+- <code title="post /v1/tool-runtime/rag-tool/insert">client.tool_runtime.rag_tool.<a href="./src/llama_stack_client/resources/tool_runtime/rag_tool.py">insert</a>(\*\*<a href="src/llama_stack_client/types/tool_runtime/rag_tool_insert_params.py">params</a>) -> None</code>
+- <code title="post /v1/tool-runtime/rag-tool/query">client.tool_runtime.rag_tool.<a href="./src/llama_stack_client/resources/tool_runtime/rag_tool.py">query</a>(\*\*<a href="src/llama_stack_client/types/tool_runtime/rag_tool_query_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shared/query_result.py">QueryResult</a></code>

 ## Agents

@ -36,20 +84,19 @@ Types:
 from llama_stack_client.types import (
    InferenceStep,
    MemoryRetrievalStep,
-    RestAPIExecutionConfig,
    ShieldCallStep,
    ToolExecutionStep,
-    ToolParamDefinition,
+    ToolResponse,
    AgentCreateResponse,
 )
 ```

 Methods:

- <code title="post /agents/create">client.agents.<a href="./src/llama_stack_client/resources/agents/agents.py">create</a>(\*\*<a href="src/llama_stack_client/types/agent_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agent_create_response.py">AgentCreateResponse</a></code>
- <code title="post /agents/delete">client.agents.<a href="./src/llama_stack_client/resources/agents/agents.py">delete</a>(\*\*<a href="src/llama_stack_client/types/agent_delete_params.py">params</a>) -> None</code>
+- <code title="post /v1/agents">client.agents.<a href="./src/llama_stack_client/resources/agents/agents.py">create</a>(\*\*<a href="src/llama_stack_client/types/agent_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agent_create_response.py">AgentCreateResponse</a></code>
+- <code title="delete /v1/agents/{agent_id}">client.agents.<a href="./src/llama_stack_client/resources/agents/agents.py">delete</a>(agent_id) -> None</code>

-### Sessions
+### Session

 Types:

@ -59,104 +106,106 @@ from llama_stack_client.types.agents import Session, SessionCreateResponse

 Methods:

- <code title="post /agents/session/create">client.agents.sessions.<a href="./src/llama_stack_client/resources/agents/sessions.py">create</a>(\*\*<a href="src/llama_stack_client/types/agents/session_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/session_create_response.py">SessionCreateResponse</a></code>
- <code title="post /agents/session/get">client.agents.sessions.<a href="./src/llama_stack_client/resources/agents/sessions.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/agents/session_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/session.py">Session</a></code>
- <code title="post /agents/session/delete">client.agents.sessions.<a href="./src/llama_stack_client/resources/agents/sessions.py">delete</a>(\*\*<a href="src/llama_stack_client/types/agents/session_delete_params.py">params</a>) -> None</code>
+- <code title="post /v1/agents/{agent_id}/session">client.agents.session.<a href="./src/llama_stack_client/resources/agents/session.py">create</a>(agent_id, \*\*<a href="src/llama_stack_client/types/agents/session_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/session_create_response.py">SessionCreateResponse</a></code>
+- <code title="get /v1/agents/{agent_id}/session/{session_id}">client.agents.session.<a href="./src/llama_stack_client/resources/agents/session.py">retrieve</a>(session_id, \*, agent_id, \*\*<a href="src/llama_stack_client/types/agents/session_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/session.py">Session</a></code>
+- <code title="delete /v1/agents/{agent_id}/session/{session_id}">client.agents.session.<a href="./src/llama_stack_client/resources/agents/session.py">delete</a>(session_id, \*, agent_id) -> None</code>

 ### Steps

 Types:

 ```python
-from llama_stack_client.types.agents import AgentsStep
+from llama_stack_client.types.agents import StepRetrieveResponse
 ```

 Methods:

- <code title="get /agents/step/get">client.agents.steps.<a href="./src/llama_stack_client/resources/agents/steps.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/agents/step_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/agents_step.py">AgentsStep</a></code>
+- <code title="get /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}">client.agents.steps.<a href="./src/llama_stack_client/resources/agents/steps.py">retrieve</a>(step_id, \*, agent_id, session_id, turn_id) -> <a href="./src/llama_stack_client/types/agents/step_retrieve_response.py">StepRetrieveResponse</a></code>

-### Turns
+### Turn

 Types:

 ```python
-from llama_stack_client.types.agents import AgentsTurnStreamChunk, Turn, TurnStreamEvent
+from llama_stack_client.types.agents import Turn, TurnCreateResponse
 ```

 Methods:

- <code title="post /agents/turn/create">client.agents.turns.<a href="./src/llama_stack_client/resources/agents/turns.py">create</a>(\*\*<a href="src/llama_stack_client/types/agents/turn_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/agents_turn_stream_chunk.py">AgentsTurnStreamChunk</a></code>
- <code title="get /agents/turn/get">client.agents.turns.<a href="./src/llama_stack_client/resources/agents/turns.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/agents/turn_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/turn.py">Turn</a></code>
+- <code title="post /v1/agents/{agent_id}/session/{session_id}/turn">client.agents.turn.<a href="./src/llama_stack_client/resources/agents/turn.py">create</a>(session_id, \*, agent_id, \*\*<a href="src/llama_stack_client/types/agents/turn_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/turn_create_response.py">TurnCreateResponse</a></code>
+- <code title="get /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}">client.agents.turn.<a href="./src/llama_stack_client/resources/agents/turn.py">retrieve</a>(turn_id, \*, agent_id, session_id) -> <a href="./src/llama_stack_client/types/agents/turn.py">Turn</a></code>
+
+## BatchInference
+
+Types:
+
+```python
+from llama_stack_client.types import BatchInferenceChatCompletionResponse
+```
+
+Methods:
+
+- <code title="post /v1/batch-inference/chat-completion">client.batch_inference.<a href="./src/llama_stack_client/resources/batch_inference.py">chat_completion</a>(\*\*<a href="src/llama_stack_client/types/batch_inference_chat_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/batch_inference_chat_completion_response.py">BatchInferenceChatCompletionResponse</a></code>
+- <code title="post /v1/batch-inference/completion">client.batch_inference.<a href="./src/llama_stack_client/resources/batch_inference.py">completion</a>(\*\*<a href="src/llama_stack_client/types/batch_inference_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shared/batch_completion.py">BatchCompletion</a></code>

 ## Datasets

 Types:

 ```python
-from llama_stack_client.types import TrainEvalDataset
+from llama_stack_client.types import (
+    ListDatasetsResponse,
+    DatasetRetrieveResponse,
+    DatasetListResponse,
+)
 ```

 Methods:

- <code title="post /datasets/create">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">create</a>(\*\*<a href="src/llama_stack_client/types/dataset_create_params.py">params</a>) -> None</code>
- <code title="post /datasets/delete">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">delete</a>(\*\*<a href="src/llama_stack_client/types/dataset_delete_params.py">params</a>) -> None</code>
- <code title="get /datasets/get">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">get</a>(\*\*<a href="src/llama_stack_client/types/dataset_get_params.py">params</a>) -> <a href="./src/llama_stack_client/types/train_eval_dataset.py">TrainEvalDataset</a></code>
+- <code title="get /v1/datasets/{dataset_id}">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">retrieve</a>(dataset_id) -> <a href="./src/llama_stack_client/types/dataset_retrieve_response.py">Optional[DatasetRetrieveResponse]</a></code>
+- <code title="get /v1/datasets">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">list</a>() -> <a href="./src/llama_stack_client/types/dataset_list_response.py">DatasetListResponse</a></code>
+- <code title="post /v1/datasets">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">register</a>(\*\*<a href="src/llama_stack_client/types/dataset_register_params.py">params</a>) -> None</code>
+- <code title="delete /v1/datasets/{dataset_id}">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">unregister</a>(dataset_id) -> None</code>

-## Evaluate
+## Eval

 Types:

 ```python
-from llama_stack_client.types import EvaluationJob
+from llama_stack_client.types import EvaluateResponse, Job
 ```

+Methods:
+
+- <code title="post /v1/eval/tasks/{task_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="post /v1/eval/tasks/{task_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
+
 ### Jobs

 Types:

 ```python
-from llama_stack_client.types.evaluate import (
-    EvaluationJobArtifacts,
-    EvaluationJobLogStream,
-    EvaluationJobStatus,
-)
+from llama_stack_client.types.eval import JobStatusResponse
 ```

 Methods:

- <code title="get /evaluate/jobs">client.evaluate.jobs.<a href="./src/llama_stack_client/resources/evaluate/jobs/jobs.py">list</a>() -> <a href="./src/llama_stack_client/types/evaluation_job.py">EvaluationJob</a></code>
- <code title="post /evaluate/job/cancel">client.evaluate.jobs.<a href="./src/llama_stack_client/resources/evaluate/jobs/jobs.py">cancel</a>(\*\*<a href="src/llama_stack_client/types/evaluate/job_cancel_params.py">params</a>) -> None</code>
+- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, task_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="delete /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, task_id) -> None</code>
+- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, task_id) -> Optional[JobStatusResponse]</code>

-#### Artifacts
+## Inspect
+
+Types:
+
+```python
+from llama_stack_client.types import HealthInfo, ProviderInfo, RouteInfo, VersionInfo
+```

 Methods:

- <code title="get /evaluate/job/artifacts">client.evaluate.jobs.artifacts.<a href="./src/llama_stack_client/resources/evaluate/jobs/artifacts.py">list</a>(\*\*<a href="src/llama_stack_client/types/evaluate/jobs/artifact_list_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate/evaluation_job_artifacts.py">EvaluationJobArtifacts</a></code>
-
-#### Logs
-
-Methods:
-
- <code title="get /evaluate/job/logs">client.evaluate.jobs.logs.<a href="./src/llama_stack_client/resources/evaluate/jobs/logs.py">list</a>(\*\*<a href="src/llama_stack_client/types/evaluate/jobs/log_list_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate/evaluation_job_log_stream.py">EvaluationJobLogStream</a></code>
-
-#### Status
-
-Methods:
-
- <code title="get /evaluate/job/status">client.evaluate.jobs.status.<a href="./src/llama_stack_client/resources/evaluate/jobs/status.py">list</a>(\*\*<a href="src/llama_stack_client/types/evaluate/jobs/status_list_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate/evaluation_job_status.py">EvaluationJobStatus</a></code>
-
-### QuestionAnswering
-
-Methods:
-
- <code title="post /evaluate/question_answering/">client.evaluate.question_answering.<a href="./src/llama_stack_client/resources/evaluate/question_answering.py">create</a>(\*\*<a href="src/llama_stack_client/types/evaluate/question_answering_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluation_job.py">EvaluationJob</a></code>
-
-## Evaluations
-
-Methods:
-
- <code title="post /evaluate/summarization/">client.evaluations.<a href="./src/llama_stack_client/resources/evaluations.py">summarization</a>(\*\*<a href="src/llama_stack_client/types/evaluation_summarization_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluation_job.py">EvaluationJob</a></code>
- <code title="post /evaluate/text_generation/">client.evaluations.<a href="./src/llama_stack_client/resources/evaluations.py">text_generation</a>(\*\*<a href="src/llama_stack_client/types/evaluation_text_generation_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluation_job.py">EvaluationJob</a></code>
+- <code title="get /v1/health">client.inspect.<a href="./src/llama_stack_client/resources/inspect.py">health</a>() -> <a href="./src/llama_stack_client/types/health_info.py">HealthInfo</a></code>
+- <code title="get /v1/version">client.inspect.<a href="./src/llama_stack_client/resources/inspect.py">version</a>() -> <a href="./src/llama_stack_client/types/version_info.py">VersionInfo</a></code>

 ## Inference

@ -164,8 +213,8 @@ Types:

 ```python
 from llama_stack_client.types import (
-    ChatCompletionStreamChunk,
-    CompletionStreamChunk,
+    CompletionResponse,
+    EmbeddingsResponse,
    TokenLogProbs,
    InferenceChatCompletionResponse,
    InferenceCompletionResponse,
@ -174,175 +223,232 @@ from llama_stack_client.types import (

 Methods:

- <code title="post /inference/chat_completion">client.inference.<a href="./src/llama_stack_client/resources/inference/inference.py">chat_completion</a>(\*\*<a href="src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
- <code title="post /inference/completion">client.inference.<a href="./src/llama_stack_client/resources/inference/inference.py">completion</a>(\*\*<a href="src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code>
+- <code title="post /v1/inference/chat-completion">client.inference.<a href="./src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
+- <code title="post /v1/inference/completion">client.inference.<a href="./src/llama_stack_client/resources/inference.py">completion</a>(\*\*<a href="src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code>
+- <code title="post /v1/inference/embeddings">client.inference.<a href="./src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="./src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>

-### Embeddings
+## VectorIo

 Types:

 ```python
-from llama_stack_client.types.inference import Embeddings
+from llama_stack_client.types import QueryChunksResponse
 ```

 Methods:

- <code title="post /inference/embeddings">client.inference.embeddings.<a href="./src/llama_stack_client/resources/inference/embeddings.py">create</a>(\*\*<a href="src/llama_stack_client/types/inference/embedding_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference/embeddings.py">Embeddings</a></code>
+- <code title="post /v1/vector-io/insert">client.vector_io.<a href="./src/llama_stack_client/resources/vector_io.py">insert</a>(\*\*<a href="src/llama_stack_client/types/vector_io_insert_params.py">params</a>) -> None</code>
+- <code title="post /v1/vector-io/query">client.vector_io.<a href="./src/llama_stack_client/resources/vector_io.py">query</a>(\*\*<a href="src/llama_stack_client/types/vector_io_query_params.py">params</a>) -> <a href="./src/llama_stack_client/types/query_chunks_response.py">QueryChunksResponse</a></code>

-## Safety
-
-Types:
-
-```python
-from llama_stack_client.types import RunSheidResponse
-```
-
-Methods:
-
- <code title="post /safety/run_shield">client.safety.<a href="./src/llama_stack_client/resources/safety.py">run_shield</a>(\*\*<a href="src/llama_stack_client/types/safety_run_shield_params.py">params</a>) -> <a href="./src/llama_stack_client/types/run_sheid_response.py">RunSheidResponse</a></code>
-
-## Memory
+## VectorDBs

 Types:

 ```python
 from llama_stack_client.types import (
-    QueryDocuments,
-    MemoryCreateResponse,
-    MemoryRetrieveResponse,
-    MemoryListResponse,
-    MemoryDropResponse,
+    ListVectorDBsResponse,
+    VectorDBRetrieveResponse,
+    VectorDBListResponse,
+    VectorDBRegisterResponse,
 )
 ```

 Methods:

- <code title="post /memory/create">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">create</a>(\*\*<a href="src/llama_stack_client/types/memory_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/memory_create_response.py">object</a></code>
- <code title="get /memory/get">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/memory_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/memory_retrieve_response.py">object</a></code>
- <code title="post /memory/update">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">update</a>(\*\*<a href="src/llama_stack_client/types/memory_update_params.py">params</a>) -> None</code>
- <code title="get /memory/list">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">list</a>() -> <a href="./src/llama_stack_client/types/memory_list_response.py">object</a></code>
- <code title="post /memory/drop">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">drop</a>(\*\*<a href="src/llama_stack_client/types/memory_drop_params.py">params</a>) -> str</code>
- <code title="post /memory/insert">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">insert</a>(\*\*<a href="src/llama_stack_client/types/memory_insert_params.py">params</a>) -> None</code>
- <code title="post /memory/query">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">query</a>(\*\*<a href="src/llama_stack_client/types/memory_query_params.py">params</a>) -> <a href="./src/llama_stack_client/types/query_documents.py">QueryDocuments</a></code>
-
-### Documents
-
-Types:
-
-```python
-from llama_stack_client.types.memory import DocumentRetrieveResponse
-```
-
-Methods:
-
- <code title="post /memory/documents/get">client.memory.documents.<a href="./src/llama_stack_client/resources/memory/documents.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/memory/document_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/memory/document_retrieve_response.py">DocumentRetrieveResponse</a></code>
- <code title="post /memory/documents/delete">client.memory.documents.<a href="./src/llama_stack_client/resources/memory/documents.py">delete</a>(\*\*<a href="src/llama_stack_client/types/memory/document_delete_params.py">params</a>) -> None</code>
-
-## PostTraining
-
-Types:
-
-```python
-from llama_stack_client.types import PostTrainingJob
-```
-
-Methods:
-
- <code title="post /post_training/preference_optimize">client.post_training.<a href="./src/llama_stack_client/resources/post_training/post_training.py">preference_optimize</a>(\*\*<a href="src/llama_stack_client/types/post_training_preference_optimize_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
- <code title="post /post_training/supervised_fine_tune">client.post_training.<a href="./src/llama_stack_client/resources/post_training/post_training.py">supervised_fine_tune</a>(\*\*<a href="src/llama_stack_client/types/post_training_supervised_fine_tune_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
-
-### Jobs
-
-Types:
-
-```python
-from llama_stack_client.types.post_training import (
-    PostTrainingJobArtifacts,
-    PostTrainingJobLogStream,
-    PostTrainingJobStatus,
-)
-```
-
-Methods:
-
- <code title="get /post_training/jobs">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">list</a>() -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
- <code title="get /post_training/job/artifacts">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">artifacts</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_artifacts_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/post_training_job_artifacts.py">PostTrainingJobArtifacts</a></code>
- <code title="post /post_training/job/cancel">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">cancel</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_cancel_params.py">params</a>) -> None</code>
- <code title="get /post_training/job/logs">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">logs</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_logs_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/post_training_job_log_stream.py">PostTrainingJobLogStream</a></code>
- <code title="get /post_training/job/status">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">status</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_status_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/post_training_job_status.py">PostTrainingJobStatus</a></code>
-
-## RewardScoring
-
-Types:
-
-```python
-from llama_stack_client.types import RewardScoring, ScoredDialogGenerations
-```
-
-Methods:
-
- <code title="post /reward_scoring/score">client.reward_scoring.<a href="./src/llama_stack_client/resources/reward_scoring.py">score</a>(\*\*<a href="src/llama_stack_client/types/reward_scoring_score_params.py">params</a>) -> <a href="./src/llama_stack_client/types/reward_scoring.py">RewardScoring</a></code>
-
-## SyntheticDataGeneration
-
-Types:
-
-```python
-from llama_stack_client.types import SyntheticDataGeneration
-```
-
-Methods:
-
- <code title="post /synthetic_data_generation/generate">client.synthetic_data_generation.<a href="./src/llama_stack_client/resources/synthetic_data_generation.py">generate</a>(\*\*<a href="src/llama_stack_client/types/synthetic_data_generation_generate_params.py">params</a>) -> <a href="./src/llama_stack_client/types/synthetic_data_generation.py">SyntheticDataGeneration</a></code>
-
-## BatchInference
-
-Types:
-
-```python
-from llama_stack_client.types import BatchChatCompletion
-```
-
-Methods:
-
- <code title="post /batch_inference/chat_completion">client.batch_inference.<a href="./src/llama_stack_client/resources/batch_inference.py">chat_completion</a>(\*\*<a href="src/llama_stack_client/types/batch_inference_chat_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/batch_chat_completion.py">BatchChatCompletion</a></code>
- <code title="post /batch_inference/completion">client.batch_inference.<a href="./src/llama_stack_client/resources/batch_inference.py">completion</a>(\*\*<a href="src/llama_stack_client/types/batch_inference_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shared/batch_completion.py">BatchCompletion</a></code>
+- <code title="get /v1/vector-dbs/{vector_db_id}">client.vector_dbs.<a href="./src/llama_stack_client/resources/vector_dbs.py">retrieve</a>(vector_db_id) -> <a href="./src/llama_stack_client/types/vector_db_retrieve_response.py">Optional[VectorDBRetrieveResponse]</a></code>
+- <code title="get /v1/vector-dbs">client.vector_dbs.<a href="./src/llama_stack_client/resources/vector_dbs.py">list</a>() -> <a href="./src/llama_stack_client/types/vector_db_list_response.py">VectorDBListResponse</a></code>
+- <code title="post /v1/vector-dbs">client.vector_dbs.<a href="./src/llama_stack_client/resources/vector_dbs.py">register</a>(\*\*<a href="src/llama_stack_client/types/vector_db_register_params.py">params</a>) -> <a href="./src/llama_stack_client/types/vector_db_register_response.py">VectorDBRegisterResponse</a></code>
+- <code title="delete /v1/vector-dbs/{vector_db_id}">client.vector_dbs.<a href="./src/llama_stack_client/resources/vector_dbs.py">unregister</a>(vector_db_id) -> None</code>

 ## Models

 Types:

 ```python
-from llama_stack_client.types import ModelServingSpec
+from llama_stack_client.types import ListModelsResponse, Model, ModelListResponse
 ```

 Methods:

- <code title="get /models/list">client.models.<a href="./src/llama_stack_client/resources/models.py">list</a>() -> <a href="./src/llama_stack_client/types/model_serving_spec.py">ModelServingSpec</a></code>
- <code title="get /models/get">client.models.<a href="./src/llama_stack_client/resources/models.py">get</a>(\*\*<a href="src/llama_stack_client/types/model_get_params.py">params</a>) -> <a href="./src/llama_stack_client/types/model_serving_spec.py">Optional</a></code>
+- <code title="get /v1/models/{model_id}">client.models.<a href="./src/llama_stack_client/resources/models.py">retrieve</a>(model_id) -> <a href="./src/llama_stack_client/types/model.py">Optional[Model]</a></code>
+- <code title="get /v1/models">client.models.<a href="./src/llama_stack_client/resources/models.py">list</a>() -> <a href="./src/llama_stack_client/types/model_list_response.py">ModelListResponse</a></code>
+- <code title="post /v1/models">client.models.<a href="./src/llama_stack_client/resources/models.py">register</a>(\*\*<a href="src/llama_stack_client/types/model_register_params.py">params</a>) -> <a href="./src/llama_stack_client/types/model.py">Model</a></code>
+- <code title="delete /v1/models/{model_id}">client.models.<a href="./src/llama_stack_client/resources/models.py">unregister</a>(model_id) -> None</code>

-## MemoryBanks
+## PostTraining

 Types:

 ```python
-from llama_stack_client.types import MemoryBankSpec
+from llama_stack_client.types import ListPostTrainingJobsResponse, PostTrainingJob
 ```

 Methods:

- <code title="get /memory_banks/list">client.memory_banks.<a href="./src/llama_stack_client/resources/memory_banks.py">list</a>() -> <a href="./src/llama_stack_client/types/memory_bank_spec.py">MemoryBankSpec</a></code>
- <code title="get /memory_banks/get">client.memory_banks.<a href="./src/llama_stack_client/resources/memory_banks.py">get</a>(\*\*<a href="src/llama_stack_client/types/memory_bank_get_params.py">params</a>) -> <a href="./src/llama_stack_client/types/memory_bank_spec.py">Optional</a></code>
+- <code title="post /v1/post-training/preference-optimize">client.post_training.<a href="./src/llama_stack_client/resources/post_training/post_training.py">preference_optimize</a>(\*\*<a href="src/llama_stack_client/types/post_training_preference_optimize_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
+- <code title="post /v1/post-training/supervised-fine-tune">client.post_training.<a href="./src/llama_stack_client/resources/post_training/post_training.py">supervised_fine_tune</a>(\*\*<a href="src/llama_stack_client/types/post_training_supervised_fine_tune_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
+
+### Job
+
+Types:
+
+```python
+from llama_stack_client.types.post_training import (
+    JobListResponse,
+    JobArtifactsResponse,
+    JobStatusResponse,
+)
+```
+
+Methods:
+
+- <code title="get /v1/post-training/jobs">client.post_training.job.<a href="./src/llama_stack_client/resources/post_training/job.py">list</a>() -> <a href="./src/llama_stack_client/types/post_training/job_list_response.py">JobListResponse</a></code>
+- <code title="get /v1/post-training/job/artifacts">client.post_training.job.<a href="./src/llama_stack_client/resources/post_training/job.py">artifacts</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_artifacts_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/job_artifacts_response.py">Optional[JobArtifactsResponse]</a></code>
+- <code title="post /v1/post-training/job/cancel">client.post_training.job.<a href="./src/llama_stack_client/resources/post_training/job.py">cancel</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_cancel_params.py">params</a>) -> None</code>
+- <code title="get /v1/post-training/job/status">client.post_training.job.<a href="./src/llama_stack_client/resources/post_training/job.py">status</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_status_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/job_status_response.py">Optional[JobStatusResponse]</a></code>
+
+## Providers
+
+Types:
+
+```python
+from llama_stack_client.types import ListProvidersResponse, ProviderListResponse
+```
+
+Methods:
+
+- <code title="get /v1/inspect/providers">client.providers.<a href="./src/llama_stack_client/resources/providers.py">list</a>() -> <a href="./src/llama_stack_client/types/provider_list_response.py">ProviderListResponse</a></code>
+
+## Routes
+
+Types:
+
+```python
+from llama_stack_client.types import ListRoutesResponse, RouteListResponse
+```
+
+Methods:
+
+- <code title="get /v1/inspect/routes">client.routes.<a href="./src/llama_stack_client/resources/routes.py">list</a>() -> <a href="./src/llama_stack_client/types/route_list_response.py">RouteListResponse</a></code>
+
+## Safety
+
+Types:
+
+```python
+from llama_stack_client.types import RunShieldResponse
+```
+
+Methods:
+
+- <code title="post /v1/safety/run-shield">client.safety.<a href="./src/llama_stack_client/resources/safety.py">run_shield</a>(\*\*<a href="src/llama_stack_client/types/safety_run_shield_params.py">params</a>) -> <a href="./src/llama_stack_client/types/run_shield_response.py">RunShieldResponse</a></code>

 ## Shields

 Types:

 ```python
-from llama_stack_client.types import ShieldSpec
+from llama_stack_client.types import ListShieldsResponse, Shield, ShieldListResponse
 ```

 Methods:

- <code title="get /shields/list">client.shields.<a href="./src/llama_stack_client/resources/shields.py">list</a>() -> <a href="./src/llama_stack_client/types/shield_spec.py">ShieldSpec</a></code>
- <code title="get /shields/get">client.shields.<a href="./src/llama_stack_client/resources/shields.py">get</a>(\*\*<a href="src/llama_stack_client/types/shield_get_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shield_spec.py">Optional</a></code>
+- <code title="get /v1/shields/{identifier}">client.shields.<a href="./src/llama_stack_client/resources/shields.py">retrieve</a>(identifier) -> <a href="./src/llama_stack_client/types/shield.py">Optional[Shield]</a></code>
+- <code title="get /v1/shields">client.shields.<a href="./src/llama_stack_client/resources/shields.py">list</a>() -> <a href="./src/llama_stack_client/types/shield_list_response.py">ShieldListResponse</a></code>
+- <code title="post /v1/shields">client.shields.<a href="./src/llama_stack_client/resources/shields.py">register</a>(\*\*<a href="src/llama_stack_client/types/shield_register_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shield.py">Shield</a></code>
+
+## SyntheticDataGeneration
+
+Types:
+
+```python
+from llama_stack_client.types import SyntheticDataGenerationResponse
+```
+
+Methods:
+
+- <code title="post /v1/synthetic-data-generation/generate">client.synthetic_data_generation.<a href="./src/llama_stack_client/resources/synthetic_data_generation.py">generate</a>(\*\*<a href="src/llama_stack_client/types/synthetic_data_generation_generate_params.py">params</a>) -> <a href="./src/llama_stack_client/types/synthetic_data_generation_response.py">SyntheticDataGenerationResponse</a></code>
+
+## Telemetry
+
+Types:
+
+```python
+from llama_stack_client.types import (
+    QuerySpansResponse,
+    SpanWithStatus,
+    Trace,
+    TelemetryGetSpanResponse,
+    TelemetryGetSpanTreeResponse,
+    TelemetryQuerySpansResponse,
+    TelemetryQueryTracesResponse,
+)
+```
+
+Methods:
+
+- <code title="get /v1/telemetry/traces/{trace_id}/spans/{span_id}">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">get_span</a>(span_id, \*, trace_id) -> <a href="./src/llama_stack_client/types/telemetry_get_span_response.py">TelemetryGetSpanResponse</a></code>
+- <code title="get /v1/telemetry/spans/{span_id}/tree">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">get_span_tree</a>(span_id, \*\*<a href="src/llama_stack_client/types/telemetry_get_span_tree_params.py">params</a>) -> <a href="./src/llama_stack_client/types/telemetry_get_span_tree_response.py">TelemetryGetSpanTreeResponse</a></code>
+- <code title="get /v1/telemetry/traces/{trace_id}">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">get_trace</a>(trace_id) -> <a href="./src/llama_stack_client/types/trace.py">Trace</a></code>
+- <code title="post /v1/telemetry/events">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">log_event</a>(\*\*<a href="src/llama_stack_client/types/telemetry_log_event_params.py">params</a>) -> None</code>
+- <code title="get /v1/telemetry/spans">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">query_spans</a>(\*\*<a href="src/llama_stack_client/types/telemetry_query_spans_params.py">params</a>) -> <a href="./src/llama_stack_client/types/telemetry_query_spans_response.py">TelemetryQuerySpansResponse</a></code>
+- <code title="get /v1/telemetry/traces">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">query_traces</a>(\*\*<a href="src/llama_stack_client/types/telemetry_query_traces_params.py">params</a>) -> <a href="./src/llama_stack_client/types/telemetry_query_traces_response.py">TelemetryQueryTracesResponse</a></code>
+- <code title="post /v1/telemetry/spans/export">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">save_spans_to_dataset</a>(\*\*<a href="src/llama_stack_client/types/telemetry_save_spans_to_dataset_params.py">params</a>) -> None</code>
+
+## Datasetio
+
+Types:
+
+```python
+from llama_stack_client.types import PaginatedRowsResult
+```
+
+Methods:
+
+- <code title="post /v1/datasetio/rows">client.datasetio.<a href="./src/llama_stack_client/resources/datasetio.py">append_rows</a>(\*\*<a href="src/llama_stack_client/types/datasetio_append_rows_params.py">params</a>) -> None</code>
+- <code title="get /v1/datasetio/rows">client.datasetio.<a href="./src/llama_stack_client/resources/datasetio.py">get_rows_paginated</a>(\*\*<a href="src/llama_stack_client/types/datasetio_get_rows_paginated_params.py">params</a>) -> <a href="./src/llama_stack_client/types/paginated_rows_result.py">PaginatedRowsResult</a></code>
+
+## Scoring
+
+Types:
+
+```python
+from llama_stack_client.types import ScoringScoreResponse, ScoringScoreBatchResponse
+```
+
+Methods:
+
+- <code title="post /v1/scoring/score">client.scoring.<a href="./src/llama_stack_client/resources/scoring.py">score</a>(\*\*<a href="src/llama_stack_client/types/scoring_score_params.py">params</a>) -> <a href="./src/llama_stack_client/types/scoring_score_response.py">ScoringScoreResponse</a></code>
+- <code title="post /v1/scoring/score-batch">client.scoring.<a href="./src/llama_stack_client/resources/scoring.py">score_batch</a>(\*\*<a href="src/llama_stack_client/types/scoring_score_batch_params.py">params</a>) -> <a href="./src/llama_stack_client/types/scoring_score_batch_response.py">ScoringScoreBatchResponse</a></code>
+
+## ScoringFunctions
+
+Types:
+
+```python
+from llama_stack_client.types import (
+    ListScoringFunctionsResponse,
+    ScoringFn,
+    ScoringFunctionListResponse,
+)
+```
+
+Methods:
+
+- <code title="get /v1/scoring-functions/{scoring_fn_id}">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">retrieve</a>(scoring_fn_id) -> <a href="./src/llama_stack_client/types/scoring_fn.py">Optional[ScoringFn]</a></code>
+- <code title="get /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">list</a>() -> <a href="./src/llama_stack_client/types/scoring_function_list_response.py">ScoringFunctionListResponse</a></code>
+- <code title="post /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">register</a>(\*\*<a href="src/llama_stack_client/types/scoring_function_register_params.py">params</a>) -> None</code>
+
+## EvalTasks
+
+Types:
+
+```python
+from llama_stack_client.types import EvalTask, ListEvalTasksResponse, EvalTaskListResponse
+```
+
+Methods:
+
+- <code title="get /v1/eval-tasks/{eval_task_id}">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">retrieve</a>(eval_task_id) -> <a href="./src/llama_stack_client/types/eval_task.py">Optional[EvalTask]</a></code>
+- <code title="get /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">list</a>() -> <a href="./src/llama_stack_client/types/eval_task_list_response.py">EvalTaskListResponse</a></code>
+- <code title="post /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">register</a>(\*\*<a href="src/llama_stack_client/types/eval_task_register_params.py">params</a>) -> None</code>
--- a/docs/to_situate/developer_cookbook.md
+++ b/docs/to_situate/developer_cookbook.md
@ -1,41 +0,0 @@
-# Llama Stack Developer Cookbook
-
-Based on your developer needs, below are references to guides to help you get started.
-
-### Hosted Llama Stack Endpoint
-* Developer Need: I want to connect to a Llama Stack endpoint to build my applications.
-* Effort: 1min
-* Guide:
-  - Checkout our [DeepLearning course](https://www.deeplearning.ai/short-courses/introducing-multimodal-llama-3-2) on building with Llama Stack apps on pre-hosted Llama Stack endpoint.
-
-
-### Local meta-reference Llama Stack Server
-* Developer Need: I want to start a local Llama Stack server with my GPU using meta-reference implementations.
-* Effort: 5min
-* Guide:
-  - Please see our [meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) on starting up a meta-reference Llama Stack server.
-
-### Llama Stack Server with Remote Providers
-* Developer need: I want a Llama Stack distribution with a remote provider.
-* Effort: 10min
-* Guide
-  - Please see our [Distributions Guide](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions) on starting up distributions with remote providers.
-
-
-### On-Device (iOS) Llama Stack
-* Developer Need: I want to use Llama Stack on-Device
-* Effort: 1.5hr
-* Guide:
-  - Please see our [iOS Llama Stack SDK](./ios_sdk.md) implementations
-
-### Assemble your own Llama Stack Distribution
-* Developer Need: I want to assemble my own distribution with API providers to my likings
-* Effort: 30min
-* Guide
-  - Please see our [Building Distribution](./building_distro.md) guide for assembling your own Llama Stack distribution with your choice of API providers.
-
-### Adding a New API Provider
-* Developer Need: I want to add a new API provider to Llama Stack.
-* Effort: 3hr
-* Guide
-  - Please see our [Adding a New API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) guide for adding a new API provider.
--- a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
+++ b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
@ -32,8 +32,8 @@
   "outputs": [],
   "source": [
    "HOST = \"localhost\"  # Replace with your host\n",
-    "LOCAL_PORT = 5000        # Replace with your local distro port\n",
-    "CLOUD_PORT = 5001        # Replace with your cloud distro port"
+    "LOCAL_PORT = 8321        # Replace with your local distro port\n",
+    "CLOUD_PORT = 8322        # Replace with your cloud distro port"
   ]
  },
  {
@ -43,7 +43,7 @@
   "source": [
    "#### 2. Set Up Local and Cloud Clients\n",
    "\n",
-    "Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:5000` and the cloud distribution running on `http://localhost:5001`.\n"
+    "Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:8321` and the cloud distribution running on `http://localhost:8322`.\n"
   ]
  },
  {
--- a/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb
+++ b/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb
@ -26,27 +26,28 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "import os\n",
-    "import requests\n",
-    "import json\n",
    "import asyncio\n",
-    "import nest_asyncio\n",
+    "import json\n",
+    "import os\n",
    "from typing import Dict, List\n",
+    "\n",
+    "import nest_asyncio\n",
+    "import requests\n",
    "from dotenv import load_dotenv\n",
    "from llama_stack_client import LlamaStackClient\n",
-    "from llama_stack_client.lib.agents.custom_tool import CustomTool\n",
-    "from llama_stack_client.types.shared.tool_response_message import ToolResponseMessage\n",
-    "from llama_stack_client.types import CompletionMessage\n",
    "from llama_stack_client.lib.agents.agent import Agent\n",
+    "from llama_stack_client.lib.agents.custom_tool import CustomTool\n",
    "from llama_stack_client.lib.agents.event_logger import EventLogger\n",
+    "from llama_stack_client.types import CompletionMessage\n",
    "from llama_stack_client.types.agent_create_params import AgentConfig\n",
+    "from llama_stack_client.types.shared.tool_response_message import ToolResponseMessage\n",
    "\n",
    "# Allow asyncio to run in Jupyter Notebook\n",
    "nest_asyncio.apply()\n",
    "\n",
-    "HOST='localhost'\n",
-    "PORT=5001\n",
-    "MODEL_NAME='meta-llama/Llama-3.2-3B-Instruct'"
+    "HOST = \"localhost\"\n",
+    "PORT = 5001\n",
+    "MODEL_NAME = \"meta-llama/Llama-3.2-3B-Instruct\"\n"
   ]
  },
  {
@ -69,7 +70,7 @@
   "outputs": [],
   "source": [
    "load_dotenv()\n",
-    "BRAVE_SEARCH_API_KEY = os.environ['BRAVE_SEARCH_API_KEY']"
+    "BRAVE_SEARCH_API_KEY = os.environ[\"BRAVE_SEARCH_API_KEY\"]\n"
   ]
  },
  {
@ -118,7 +119,7 @@
    "                cleaned = {k: v for k, v in results[idx].items() if k in selected_keys}\n",
    "                clean_response.append(cleaned)\n",
    "\n",
-    "        return {\"query\": query, \"top_k\": clean_response}"
+    "        return {\"query\": query, \"top_k\": clean_response}\n"
   ]
  },
  {
@ -157,25 +158,29 @@
    "        for message in messages:\n",
    "            if isinstance(message, CompletionMessage) and message.tool_calls:\n",
    "                for tool_call in message.tool_calls:\n",
-    "                    if 'query' in tool_call.arguments:\n",
-    "                        query = tool_call.arguments['query']\n",
+    "                    if \"query\" in tool_call.arguments:\n",
+    "                        query = tool_call.arguments[\"query\"]\n",
    "                        call_id = tool_call.call_id\n",
    "\n",
    "        if query:\n",
    "            search_result = await self.run_impl(query)\n",
-    "            return [ToolResponseMessage(\n",
-    "                call_id=call_id,\n",
-    "                role=\"ipython\",\n",
-    "                content=self._format_response_for_agent(search_result),\n",
-    "                tool_name=\"brave_search\"\n",
-    "            )]\n",
+    "            return [\n",
+    "                ToolResponseMessage(\n",
+    "                    call_id=call_id,\n",
+    "                    role=\"ipython\",\n",
+    "                    content=self._format_response_for_agent(search_result),\n",
+    "                    tool_name=\"brave_search\",\n",
+    "                )\n",
+    "            ]\n",
    "\n",
-    "        return [ToolResponseMessage(\n",
-    "            call_id=\"no_call_id\",\n",
-    "            role=\"ipython\",\n",
-    "            content=\"No query provided.\",\n",
-    "            tool_name=\"brave_search\"\n",
-    "        )]\n",
+    "        return [\n",
+    "            ToolResponseMessage(\n",
+    "                call_id=\"no_call_id\",\n",
+    "                role=\"ipython\",\n",
+    "                content=\"No query provided.\",\n",
+    "                tool_name=\"brave_search\",\n",
+    "            )\n",
+    "        ]\n",
    "\n",
    "    def _format_response_for_agent(self, search_result):\n",
    "        parsed_result = json.loads(search_result)\n",
@ -186,7 +191,7 @@
    "                f\"   URL: {result.get('url', 'No URL')}\\n\"\n",
    "                f\"   Description: {result.get('description', 'No Description')}\\n\\n\"\n",
    "            )\n",
-    "        return formatted_result"
+    "        return formatted_result\n"
   ]
  },
  {
@ -209,7 +214,7 @@
    "async def execute_search(query: str):\n",
    "    web_search_tool = WebSearchTool(api_key=BRAVE_SEARCH_API_KEY)\n",
    "    result = await web_search_tool.run_impl(query)\n",
-    "    print(\"Search Results:\", result)"
+    "    print(\"Search Results:\", result)\n"
   ]
  },
  {
@ -236,7 +241,7 @@
   ],
   "source": [
    "query = \"Latest developments in quantum computing\"\n",
-    "asyncio.run(execute_search(query))"
+    "asyncio.run(execute_search(query))\n"
   ]
  },
  {
@ -288,19 +293,17 @@
    "\n",
    "    # Initialize custom tool (ensure `WebSearchTool` is defined earlier in the notebook)\n",
    "    webSearchTool = WebSearchTool(api_key=BRAVE_SEARCH_API_KEY)\n",
-    "    \n",
+    "\n",
    "    # Define the agent configuration, including the model and tool setup\n",
    "    agent_config = AgentConfig(\n",
    "        model=MODEL_NAME,\n",
    "        instructions=\"\"\"You are a helpful assistant that responds to user queries with relevant information and cites sources when available.\"\"\",\n",
    "        sampling_params={\n",
-    "            \"strategy\": \"greedy\",\n",
-    "            \"temperature\": 1.0,\n",
-    "            \"top_p\": 0.9,\n",
+    "            \"strategy\": {\n",
+    "                \"type\": \"greedy\",\n",
+    "            },\n",
    "        },\n",
-    "        tools=[\n",
-    "            webSearchTool.get_tool_definition()\n",
-    "        ],\n",
+    "        tools=[webSearchTool.get_tool_definition()],\n",
    "        tool_choice=\"auto\",\n",
    "        tool_prompt_format=\"python_list\",\n",
    "        input_shields=input_shields,\n",
@ -329,8 +332,9 @@
    "    async for log in EventLogger().log(response):\n",
    "        log.print()\n",
    "\n",
+    "\n",
    "# Run the function asynchronously in a Jupyter Notebook cell\n",
-    "await run_main(disable_safety=True)"
+    "await run_main(disable_safety=True)\n"
   ]
  }
 ],
--- a/docs/zero_to_hero_guide/07_Agents101.ipynb
+++ b/docs/zero_to_hero_guide/07_Agents101.ipynb
@ -50,8 +50,8 @@
   "outputs": [],
   "source": [
    "HOST = \"localhost\"  # Replace with your host\n",
-    "PORT = 5001        # Replace with your port\n",
-    "MODEL_NAME='meta-llama/Llama-3.2-3B-Instruct'"
+    "PORT = 5001  # Replace with your port\n",
+    "MODEL_NAME = \"meta-llama/Llama-3.2-3B-Instruct\"\n"
   ]
  },
  {
@ -60,10 +60,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from dotenv import load_dotenv\n",
    "import os\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
    "load_dotenv()\n",
-    "BRAVE_SEARCH_API_KEY = os.environ['BRAVE_SEARCH_API_KEY']"
+    "BRAVE_SEARCH_API_KEY = os.environ[\"BRAVE_SEARCH_API_KEY\"]\n"
   ]
  },
  {
@ -104,20 +106,22 @@
   ],
   "source": [
    "import os\n",
+    "\n",
    "from llama_stack_client import LlamaStackClient\n",
    "from llama_stack_client.lib.agents.agent import Agent\n",
    "from llama_stack_client.lib.agents.event_logger import EventLogger\n",
    "from llama_stack_client.types.agent_create_params import AgentConfig\n",
    "\n",
+    "\n",
    "async def agent_example():\n",
    "    client = LlamaStackClient(base_url=f\"http://{HOST}:{PORT}\")\n",
    "    agent_config = AgentConfig(\n",
    "        model=MODEL_NAME,\n",
    "        instructions=\"You are a helpful assistant! If you call builtin tools like brave search, follow the syntax brave_search.call(…)\",\n",
    "        sampling_params={\n",
-    "            \"strategy\": \"greedy\",\n",
-    "            \"temperature\": 1.0,\n",
-    "            \"top_p\": 0.9,\n",
+    "            \"strategy\": {\n",
+    "                \"type\": \"greedy\",\n",
+    "            },\n",
    "        },\n",
    "        tools=[\n",
    "            {\n",
@ -157,7 +161,7 @@
    "            log.print()\n",
    "\n",
    "\n",
-    "await agent_example()"
+    "await agent_example()\n"
   ]
  },
  {
--- a/docs/zero_to_hero_guide/README.md
+++ b/docs/zero_to_hero_guide/README.md
@ -89,7 +89,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
   ```
   ...
   Build Successful! Next steps:
-   1. Set the environment variables: LLAMASTACK_PORT, OLLAMA_URL, INFERENCE_MODEL, SAFETY_MODEL
+   1. Set the environment variables: LLAMA_STACK_PORT, OLLAMA_URL, INFERENCE_MODEL, SAFETY_MODEL
   2. `llama stack run /Users/<username>/.llama/distributions/llamastack-ollama/ollama-run.yaml
   ```

@ -157,7 +157,15 @@ curl http://localhost:$LLAMA_STACK_PORT/alpha/inference/chat-completion
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write me a 2-sentence poem about the moon"}
    ],
-    "sampling_params": {"temperature": 0.7, "seed": 42, "max_tokens": 512}
+    "sampling_params": {
+      "strategy": {
+         "type": "top_p",
+         "temperature": 0.7,
+         "top_p": 0.95
+      },
+      "seed": 42,
+      "max_tokens": 512
+   }
 }
 EOF
 ```
--- a/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb
+++ b/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb
@ -83,8 +83,8 @@
   },
   "outputs": [],
   "source": [
-    "LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n",
-    "LLAMA31_8B_INSTRUCT = \"Llama3.1-8B-Instruct\""
+    "LLAMA_STACK_API_TOGETHER_URL = \"https://llama-stack.together.ai\"\n",
+    "LLAMA31_8B_INSTRUCT = \"Llama3.1-8B-Instruct\"\n"
   ]
  },
  {
@ -107,12 +107,13 @@
    "    AgentConfigToolSearchToolDefinition,\n",
    ")\n",
    "\n",
+    "\n",
    "# Helper function to create an agent with tools\n",
    "async def create_tool_agent(\n",
    "    client: LlamaStackClient,\n",
    "    tools: List[Dict],\n",
    "    instructions: str = \"You are a helpful assistant\",\n",
-    "    model: str = LLAMA31_8B_INSTRUCT\n",
+    "    model: str = LLAMA31_8B_INSTRUCT,\n",
    ") -> Agent:\n",
    "    \"\"\"Create an agent with specified tools.\"\"\"\n",
    "    print(\"Using the following model: \", model)\n",
@ -120,9 +121,9 @@
    "        model=model,\n",
    "        instructions=instructions,\n",
    "        sampling_params={\n",
-    "            \"strategy\": \"greedy\",\n",
-    "            \"temperature\": 1.0,\n",
-    "            \"top_p\": 0.9,\n",
+    "            \"strategy\": {\n",
+    "                \"type\": \"greedy\",\n",
+    "            },\n",
    "        },\n",
    "        tools=tools,\n",
    "        tool_choice=\"auto\",\n",
@ -130,7 +131,7 @@
    "        enable_session_persistence=True,\n",
    "    )\n",
    "\n",
-    "    return Agent(client, agent_config)"
+    "    return Agent(client, agent_config)\n"
   ]
  },
  {
@ -172,7 +173,8 @@
   ],
   "source": [
    "# comment this if you don't have a BRAVE_SEARCH_API_KEY\n",
-    "os.environ[\"BRAVE_SEARCH_API_KEY\"] = 'YOUR_BRAVE_SEARCH_API_KEY'\n",
+    "os.environ[\"BRAVE_SEARCH_API_KEY\"] = \"YOUR_BRAVE_SEARCH_API_KEY\"\n",
+    "\n",
    "\n",
    "async def create_search_agent(client: LlamaStackClient) -> Agent:\n",
    "    \"\"\"Create an agent with Brave Search capability.\"\"\"\n",
@ -186,8 +188,8 @@
    "\n",
    "    return await create_tool_agent(\n",
    "        client=client,\n",
-    "        tools=[search_tool], # set this to [] if you don't have a BRAVE_SEARCH_API_KEY\n",
-    "        model = LLAMA31_8B_INSTRUCT,\n",
+    "        tools=[search_tool],  # set this to [] if you don't have a BRAVE_SEARCH_API_KEY\n",
+    "        model=LLAMA31_8B_INSTRUCT,\n",
    "        instructions=\"\"\"\n",
    "        You are a research assistant that can search the web.\n",
    "        Always cite your sources with URLs when providing information.\n",
@ -198,9 +200,10 @@
    "\n",
    "        SOURCES:\n",
    "        - [Source title](URL)\n",
-    "        \"\"\"\n",
+    "        \"\"\",\n",
    "    )\n",
    "\n",
+    "\n",
    "# Example usage\n",
    "async def search_example():\n",
    "    client = LlamaStackClient(base_url=LLAMA_STACK_API_TOGETHER_URL)\n",
@ -212,7 +215,7 @@
    "    # Example queries\n",
    "    queries = [\n",
    "        \"What are the latest developments in quantum computing?\",\n",
-    "        #\"Who won the most recent Super Bowl?\",\n",
+    "        # \"Who won the most recent Super Bowl?\",\n",
    "    ]\n",
    "\n",
    "    for query in queries:\n",
@ -227,8 +230,9 @@
    "        async for log in EventLogger().log(response):\n",
    "            log.print()\n",
    "\n",
+    "\n",
    "# Run the example (in Jupyter, use asyncio.run())\n",
-    "await search_example()"
+    "await search_example()\n"
   ]
  },
  {
@ -286,12 +290,16 @@
    }
   ],
   "source": [
-    "from typing import TypedDict, Optional, Dict, Any\n",
-    "from datetime import datetime\n",
    "import json\n",
-    "from llama_stack_client.types.tool_param_definition_param import ToolParamDefinitionParam\n",
-    "from llama_stack_client.types import CompletionMessage,ToolResponseMessage\n",
+    "from datetime import datetime\n",
+    "from typing import Any, Dict, Optional, TypedDict\n",
+    "\n",
    "from llama_stack_client.lib.agents.custom_tool import CustomTool\n",
+    "from llama_stack_client.types import CompletionMessage, ToolResponseMessage\n",
+    "from llama_stack_client.types.tool_param_definition_param import (\n",
+    "    ToolParamDefinitionParam,\n",
+    ")\n",
+    "\n",
    "\n",
    "class WeatherTool(CustomTool):\n",
    "    \"\"\"Example custom tool for weather information.\"\"\"\n",
@ -305,16 +313,15 @@
    "    def get_params_definition(self) -> Dict[str, ToolParamDefinitionParam]:\n",
    "        return {\n",
    "            \"location\": ToolParamDefinitionParam(\n",
-    "                param_type=\"str\",\n",
-    "                description=\"City or location name\",\n",
-    "                required=True\n",
+    "                param_type=\"str\", description=\"City or location name\", required=True\n",
    "            ),\n",
    "            \"date\": ToolParamDefinitionParam(\n",
    "                param_type=\"str\",\n",
    "                description=\"Optional date (YYYY-MM-DD)\",\n",
-    "                required=False\n",
-    "            )\n",
+    "                required=False,\n",
+    "            ),\n",
    "        }\n",
+    "\n",
    "    async def run(self, messages: List[CompletionMessage]) -> List[ToolResponseMessage]:\n",
    "        assert len(messages) == 1, \"Expected single message\"\n",
    "\n",
@ -337,20 +344,14 @@
    "        )\n",
    "        return [message]\n",
    "\n",
-    "    async def run_impl(self, location: str, date: Optional[str] = None) -> Dict[str, Any]:\n",
+    "    async def run_impl(\n",
+    "        self, location: str, date: Optional[str] = None\n",
+    "    ) -> Dict[str, Any]:\n",
    "        \"\"\"Simulate getting weather data (replace with actual API call).\"\"\"\n",
    "        # Mock implementation\n",
    "        if date:\n",
-    "            return {\n",
-    "            \"temperature\": 90.1,\n",
-    "            \"conditions\": \"sunny\",\n",
-    "            \"humidity\": 40.0\n",
-    "        }\n",
-    "        return {\n",
-    "            \"temperature\": 72.5,\n",
-    "            \"conditions\": \"partly cloudy\",\n",
-    "            \"humidity\": 65.0\n",
-    "        }\n",
+    "            return {\"temperature\": 90.1, \"conditions\": \"sunny\", \"humidity\": 40.0}\n",
+    "        return {\"temperature\": 72.5, \"conditions\": \"partly cloudy\", \"humidity\": 65.0}\n",
    "\n",
    "\n",
    "async def create_weather_agent(client: LlamaStackClient) -> Agent:\n",
@ -358,38 +359,33 @@
    "\n",
    "    # Create the agent with the tool\n",
    "    weather_tool = WeatherTool()\n",
-    "    \n",
+    "\n",
    "    agent_config = AgentConfig(\n",
    "        model=LLAMA31_8B_INSTRUCT,\n",
-    "        #model=model_name,\n",
+    "        # model=model_name,\n",
    "        instructions=\"\"\"\n",
    "        You are a weather assistant that can provide weather information.\n",
    "        Always specify the location clearly in your responses.\n",
    "        Include both temperature and conditions in your summaries.\n",
    "        \"\"\",\n",
    "        sampling_params={\n",
-    "            \"strategy\": \"greedy\",\n",
-    "            \"temperature\": 1.0,\n",
-    "            \"top_p\": 0.9,\n",
+    "            \"strategy\": {\n",
+    "                \"type\": \"greedy\",\n",
+    "            },\n",
    "        },\n",
-    "        tools=[\n",
-    "            weather_tool.get_tool_definition()\n",
-    "        ],\n",
+    "        tools=[weather_tool.get_tool_definition()],\n",
    "        tool_choice=\"auto\",\n",
    "        tool_prompt_format=\"json\",\n",
    "        input_shields=[],\n",
    "        output_shields=[],\n",
-    "        enable_session_persistence=True\n",
+    "        enable_session_persistence=True,\n",
    "    )\n",
    "\n",
-    "    agent = Agent(\n",
-    "        client=client,\n",
-    "        agent_config=agent_config,\n",
-    "        custom_tools=[weather_tool]\n",
-    "    )\n",
+    "    agent = Agent(client=client, agent_config=agent_config, custom_tools=[weather_tool])\n",
    "\n",
    "    return agent\n",
    "\n",
+    "\n",
    "# Example usage\n",
    "async def weather_example():\n",
    "    client = LlamaStackClient(base_url=LLAMA_STACK_API_TOGETHER_URL)\n",
@ -413,12 +409,14 @@
    "        async for log in EventLogger().log(response):\n",
    "            log.print()\n",
    "\n",
+    "\n",
    "# For Jupyter notebooks\n",
    "import nest_asyncio\n",
+    "\n",
    "nest_asyncio.apply()\n",
    "\n",
    "# Run the example\n",
-    "await weather_example()"
+    "await weather_example()\n"
   ]
  },
  {
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@ -7,6 +7,7 @@
 from datetime import datetime
 from enum import Enum
 from typing import (
+    Annotated,
    Any,
    AsyncIterator,
    Dict,
@ -20,21 +21,18 @@ from typing import (

 from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, ConfigDict, Field
-from typing_extensions import Annotated

-from llama_stack.apis.common.content_types import InterleavedContent, URL
+from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, URL
 from llama_stack.apis.inference import (
    CompletionMessage,
    SamplingParams,
    ToolCall,
-    ToolCallDelta,
    ToolChoice,
    ToolPromptFormat,
    ToolResponse,
    ToolResponseMessage,
    UserMessage,
 )
-from llama_stack.apis.memory import MemoryBank
 from llama_stack.apis.safety import SafetyViolation
 from llama_stack.apis.tools import ToolDef
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -90,7 +88,7 @@ class MemoryRetrievalStep(StepCommon):
    step_type: Literal[StepType.memory_retrieval.value] = (
        StepType.memory_retrieval.value
    )
-    memory_bank_ids: List[str]
+    vector_db_ids: str
    inserted_context: InterleavedContent


@ -134,8 +132,6 @@ class Session(BaseModel):
    turns: List[Turn]
    started_at: datetime

-    memory_bank: Optional[MemoryBank] = None
-

 class AgentToolGroupWithArgs(BaseModel):
    name: str
@ -159,9 +155,7 @@ class AgentConfigCommon(BaseModel):
    toolgroups: Optional[List[AgentToolGroup]] = Field(default_factory=list)
    client_tools: Optional[List[ToolDef]] = Field(default_factory=list)
    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(
-        default=ToolPromptFormat.json
-    )
+    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)

    max_infer_iters: int = 10

@ -216,8 +210,7 @@ class AgentTurnResponseStepProgressPayload(BaseModel):
    step_type: StepType
    step_id: str

-    text_delta: Optional[str] = None
-    tool_call_delta: Optional[ToolCallDelta] = None
+    delta: ContentDelta


@json_schema_type
@ -236,11 +229,8 @@ class AgentTurnResponseTurnCompletePayload(BaseModel):
    turn: Turn


-@json_schema_type
-class AgentTurnResponseEvent(BaseModel):
-    """Streamed agent execution response."""
-
-    payload: Annotated[
+AgentTurnResponseEventPayload = register_schema(
+    Annotated[
        Union[
            AgentTurnResponseStepStartPayload,
            AgentTurnResponseStepProgressPayload,
@ -249,7 +239,14 @@ class AgentTurnResponseEvent(BaseModel):
            AgentTurnResponseTurnCompletePayload,
        ],
        Field(discriminator="event_type"),
-    ]
+    ],
+    name="AgentTurnResponseEventPayload",
+)
+
+
+@json_schema_type
+class AgentTurnResponseEvent(BaseModel):
+    payload: AgentTurnResponseEventPayload


@json_schema_type
@ -298,13 +295,13 @@ class AgentStepResponse(BaseModel):
@runtime_checkable
@trace_protocol
 class Agents(Protocol):
-    @webmethod(route="/agents/create")
+    @webmethod(route="/agents", method="POST")
    async def create_agent(
        self,
        agent_config: AgentConfig,
    ) -> AgentCreateResponse: ...

-    @webmethod(route="/agents/turn/create")
+    @webmethod(route="/agents/{agent_id}/session/{session_id}/turn", method="POST")
    async def create_agent_turn(
        self,
        agent_id: str,
@ -320,36 +317,52 @@ class Agents(Protocol):
        toolgroups: Optional[List[AgentToolGroup]] = None,
    ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ...

-    @webmethod(route="/agents/turn/get")
+    @webmethod(
+        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}", method="GET"
+    )
    async def get_agents_turn(
-        self, agent_id: str, session_id: str, turn_id: str
+        self,
+        agent_id: str,
+        session_id: str,
+        turn_id: str,
    ) -> Turn: ...

-    @webmethod(route="/agents/step/get")
+    @webmethod(
+        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
+        method="GET",
+    )
    async def get_agents_step(
-        self, agent_id: str, session_id: str, turn_id: str, step_id: str
+        self,
+        agent_id: str,
+        session_id: str,
+        turn_id: str,
+        step_id: str,
    ) -> AgentStepResponse: ...

-    @webmethod(route="/agents/session/create")
+    @webmethod(route="/agents/{agent_id}/session", method="POST")
    async def create_agent_session(
        self,
        agent_id: str,
        session_name: str,
    ) -> AgentSessionCreateResponse: ...

-    @webmethod(route="/agents/session/get")
+    @webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET")
    async def get_agents_session(
        self,
-        agent_id: str,
        session_id: str,
+        agent_id: str,
        turn_ids: Optional[List[str]] = None,
    ) -> Session: ...

-    @webmethod(route="/agents/session/delete")
-    async def delete_agents_session(self, agent_id: str, session_id: str) -> None: ...
+    @webmethod(route="/agents/{agent_id}/session/{session_id}", method="DELETE")
+    async def delete_agents_session(
+        self,
+        session_id: str,
+        agent_id: str,
+    ) -> None: ...

-    @webmethod(route="/agents/delete")
-    async def delete_agents(
+    @webmethod(route="/agents/{agent_id}", method="DELETE")
+    async def delete_agent(
        self,
        agent_id: str,
    ) -> None: ...
--- a/llama_stack/apis/agents/event_logger.py
+++ b/llama_stack/apis/agents/event_logger.py
@ -11,9 +11,13 @@ from llama_models.llama3.api.tool_utils import ToolUtils
 from termcolor import cprint

 from llama_stack.apis.agents import AgentTurnResponseEventType, StepType
-
+from llama_stack.apis.common.content_types import ToolCallParseStatus
 from llama_stack.apis.inference import ToolResponseMessage

+from llama_stack.providers.utils.inference.prompt_adapter import (
+    interleaved_content_as_str,
+)
+

 class LogEvent:
    def __init__(
@ -57,8 +61,11 @@ class EventLogger:
                # since it does not produce event but instead
                # a Message
                if isinstance(chunk, ToolResponseMessage):
-                    yield chunk, LogEvent(
-                        role="CustomTool", content=chunk.content, color="grey"
+                    yield (
+                        chunk,
+                        LogEvent(
+                            role="CustomTool", content=chunk.content, color="grey"
+                        ),
                    )
                continue

@ -80,14 +87,20 @@ class EventLogger:
            ):
                violation = event.payload.step_details.violation
                if not violation:
-                    yield event, LogEvent(
-                        role=step_type, content="No Violation", color="magenta"
+                    yield (
+                        event,
+                        LogEvent(
+                            role=step_type, content="No Violation", color="magenta"
+                        ),
                    )
                else:
-                    yield event, LogEvent(
-                        role=step_type,
-                        content=f"{violation.metadata} {violation.user_message}",
-                        color="red",
+                    yield (
+                        event,
+                        LogEvent(
+                            role=step_type,
+                            content=f"{violation.metadata} {violation.user_message}",
+                            color="red",
+                        ),
                    )

            # handle inference
@ -95,8 +108,11 @@ class EventLogger:
                if stream:
                    if event_type == EventType.step_start.value:
                        # TODO: Currently this event is never received
-                        yield event, LogEvent(
-                            role=step_type, content="", end="", color="yellow"
+                        yield (
+                            event,
+                            LogEvent(
+                                role=step_type, content="", end="", color="yellow"
+                            ),
                        )
                    elif event_type == EventType.step_progress.value:
                        # HACK: if previous was not step/event was not inference's step_progress
@ -107,24 +123,34 @@ class EventLogger:
                            previous_event_type != EventType.step_progress.value
                            and previous_step_type != StepType.inference
                        ):
-                            yield event, LogEvent(
-                                role=step_type, content="", end="", color="yellow"
+                            yield (
+                                event,
+                                LogEvent(
+                                    role=step_type, content="", end="", color="yellow"
+                                ),
                            )

-                        if event.payload.tool_call_delta:
-                            if isinstance(event.payload.tool_call_delta.content, str):
-                                yield event, LogEvent(
-                                    role=None,
-                                    content=event.payload.tool_call_delta.content,
-                                    end="",
-                                    color="cyan",
+                        delta = event.payload.delta
+                        if delta.type == "tool_call":
+                            if delta.parse_status == ToolCallParseStatus.succeeded:
+                                yield (
+                                    event,
+                                    LogEvent(
+                                        role=None,
+                                        content=delta.tool_call,
+                                        end="",
+                                        color="cyan",
+                                    ),
                                )
                        else:
-                            yield event, LogEvent(
-                                role=None,
-                                content=event.payload.text_delta,
-                                end="",
-                                color="yellow",
+                            yield (
+                                event,
+                                LogEvent(
+                                    role=None,
+                                    content=delta.text,
+                                    end="",
+                                    color="yellow",
+                                ),
                            )
                    else:
                        # step_complete
@ -140,10 +166,13 @@ class EventLogger:
                            )
                        else:
                            content = response.content
-                        yield event, LogEvent(
-                            role=step_type,
-                            content=content,
-                            color="yellow",
+                        yield (
+                            event,
+                            LogEvent(
+                                role=step_type,
+                                content=content,
+                                color="yellow",
+                            ),
                        )

            # handle tool_execution
@ -155,16 +184,22 @@ class EventLogger:
            ):
                details = event.payload.step_details
                for t in details.tool_calls:
-                    yield event, LogEvent(
-                        role=step_type,
-                        content=f"Tool:{t.tool_name} Args:{t.arguments}",
-                        color="green",
+                    yield (
+                        event,
+                        LogEvent(
+                            role=step_type,
+                            content=f"Tool:{t.tool_name} Args:{t.arguments}",
+                            color="green",
+                        ),
                    )
                for r in details.tool_responses:
-                    yield event, LogEvent(
-                        role=step_type,
-                        content=f"Tool:{r.tool_name} Response:{r.content}",
-                        color="green",
+                    yield (
+                        event,
+                        LogEvent(
+                            role=step_type,
+                            content=f"Tool:{r.tool_name} Response:{r.content}",
+                            color="green",
+                        ),
                    )

            if (
@ -172,15 +207,16 @@ class EventLogger:
                and event_type == EventType.step_complete.value
            ):
                details = event.payload.step_details
-                inserted_context = interleaved_text_media_as_str(
-                    details.inserted_context
-                )
-                content = f"fetched {len(inserted_context)} bytes from {details.memory_bank_ids}"
+                inserted_context = interleaved_content_as_str(details.inserted_context)
+                content = f"fetched {len(inserted_context)} bytes from {details.vector_db_ids}"

-                yield event, LogEvent(
-                    role=step_type,
-                    content=content,
-                    color="cyan",
+                yield (
+                    event,
+                    LogEvent(
+                        role=step_type,
+                        content=content,
+                        color="cyan",
+                    ),
                )

            previous_event_type = event_type
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@ -7,7 +7,6 @@
 from typing import List, Optional, Protocol, runtime_checkable

 from llama_models.schema_utils import json_schema_type, webmethod
-
 from pydantic import BaseModel, Field

 from llama_stack.apis.inference import (
@ -44,9 +43,7 @@ class BatchChatCompletionRequest(BaseModel):
    # zero-shot tool definitions as input to the model
    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(
-        default=ToolPromptFormat.json
-    )
+    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
    logprobs: Optional[LogProbConfig] = None


@ -57,7 +54,7 @@ class BatchChatCompletionResponse(BaseModel):

@runtime_checkable
 class BatchInference(Protocol):
-    @webmethod(route="/batch-inference/completion")
+    @webmethod(route="/batch-inference/completion", method="POST")
    async def batch_completion(
        self,
        model: str,
@ -66,7 +63,7 @@ class BatchInference(Protocol):
        logprobs: Optional[LogProbConfig] = None,
    ) -> BatchCompletionResponse: ...

-    @webmethod(route="/batch-inference/chat-completion")
+    @webmethod(route="/batch-inference/chat-completion", method="POST")
    async def batch_chat_completion(
        self,
        model: str,
@ -75,6 +72,6 @@ class BatchInference(Protocol):
        # zero-shot tool definitions as input to the model
        tools: Optional[List[ToolDefinition]] = list,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
        logprobs: Optional[LogProbConfig] = None,
    ) -> BatchChatCompletionResponse: ...
--- a/llama_stack/apis/common/content_types.py
+++ b/llama_stack/apis/common/content_types.py
@ -5,10 +5,12 @@
 # the root directory of this source tree.

 import base64
+from enum import Enum
 from typing import Annotated, List, Literal, Optional, Union

-from llama_models.schema_utils import json_schema_type, register_schema
+from llama_models.llama3.api.datatypes import ToolCall

+from llama_models.schema_utils import json_schema_type, register_schema
 from pydantic import BaseModel, Field, field_serializer, model_validator


@ -36,8 +38,9 @@ class _URLOrData(BaseModel):


@json_schema_type
-class ImageContentItem(_URLOrData):
+class ImageContentItem(BaseModel):
    type: Literal["image"] = "image"
+    image: _URLOrData


@json_schema_type
@ -60,3 +63,44 @@ InterleavedContent = register_schema(
    Union[str, InterleavedContentItem, List[InterleavedContentItem]],
    name="InterleavedContent",
 )
+
+
+@json_schema_type
+class TextDelta(BaseModel):
+    type: Literal["text"] = "text"
+    text: str
+
+
+@json_schema_type
+class ImageDelta(BaseModel):
+    type: Literal["image"] = "image"
+    image: bytes
+
+
+@json_schema_type
+class ToolCallParseStatus(Enum):
+    started = "started"
+    in_progress = "in_progress"
+    failed = "failed"
+    succeeded = "succeeded"
+
+
+@json_schema_type
+class ToolCallDelta(BaseModel):
+    type: Literal["tool_call"] = "tool_call"
+
+    # you either send an in-progress tool call so the client can stream a long
+    # code generation or you send the final parsed tool call at the end of the
+    # stream
+    tool_call: Union[str, ToolCall]
+    parse_status: ToolCallParseStatus
+
+
+# streaming completions send a stream of ContentDeltas
+ContentDelta = register_schema(
+    Annotated[
+        Union[TextDelta, ImageDelta, ToolCallDelta],
+        Field(discriminator="type"),
+    ],
+    name="ContentDelta",
+)
--- a/llama_stack/apis/common/type_system.py
+++ b/llama_stack/apis/common/type_system.py
@ -6,54 +6,71 @@

 from typing import Literal, Union

-from llama_models.schema_utils import register_schema
+from llama_models.schema_utils import json_schema_type, register_schema
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated


+@json_schema_type
 class StringType(BaseModel):
    type: Literal["string"] = "string"


+@json_schema_type
 class NumberType(BaseModel):
    type: Literal["number"] = "number"


+@json_schema_type
 class BooleanType(BaseModel):
    type: Literal["boolean"] = "boolean"


+@json_schema_type
 class ArrayType(BaseModel):
    type: Literal["array"] = "array"


+@json_schema_type
 class ObjectType(BaseModel):
    type: Literal["object"] = "object"


+@json_schema_type
 class JsonType(BaseModel):
    type: Literal["json"] = "json"


+@json_schema_type
 class UnionType(BaseModel):
    type: Literal["union"] = "union"


+@json_schema_type
 class ChatCompletionInputType(BaseModel):
    # expects List[Message] for messages
    type: Literal["chat_completion_input"] = "chat_completion_input"


+@json_schema_type
 class CompletionInputType(BaseModel):
    # expects InterleavedTextMedia for content
    type: Literal["completion_input"] = "completion_input"


+@json_schema_type
 class AgentTurnInputType(BaseModel):
    # expects List[Message] for messages (may also include attachments?)
    type: Literal["agent_turn_input"] = "agent_turn_input"


+@json_schema_type
+class DialogType(BaseModel):
+    # expects List[Message] for messages
+    # this type semantically contains the output label whereas ChatCompletionInputType does not
+    type: Literal["dialog"] = "dialog"
+
+
 ParamType = register_schema(
    Annotated[
        Union[
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@ -29,7 +29,7 @@ class DatasetIO(Protocol):
    # keeping for aligning with inference/safety, but this is not used
    dataset_store: DatasetStore

-    @webmethod(route="/datasetio/get-rows-paginated", method="GET")
+    @webmethod(route="/datasetio/rows", method="GET")
    async def get_rows_paginated(
        self,
        dataset_id: str,
@ -38,7 +38,7 @@ class DatasetIO(Protocol):
        filter_condition: Optional[str] = None,
    ) -> PaginatedRowsResult: ...

-    @webmethod(route="/datasetio/append-rows", method="POST")
+    @webmethod(route="/datasetio/rows", method="POST")
    async def append_rows(
        self, dataset_id: str, rows: List[Dict[str, Any]]
    ) -> None: ...
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@ -7,11 +7,9 @@
 from typing import Any, Dict, List, Literal, Optional, Protocol

 from llama_models.schema_utils import json_schema_type, webmethod
-
 from pydantic import BaseModel, Field

 from llama_stack.apis.common.content_types import URL
-
 from llama_stack.apis.common.type_system import ParamType
 from llama_stack.apis.resource import Resource, ResourceType

@ -44,8 +42,12 @@ class DatasetInput(CommonDatasetFields, BaseModel):
    provider_dataset_id: Optional[str] = None


+class ListDatasetsResponse(BaseModel):
+    data: List[Dataset]
+
+
 class Datasets(Protocol):
-    @webmethod(route="/datasets/register", method="POST")
+    @webmethod(route="/datasets", method="POST")
    async def register_dataset(
        self,
        dataset_id: str,
@ -56,16 +58,16 @@ class Datasets(Protocol):
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None: ...

-    @webmethod(route="/datasets/get", method="GET")
+    @webmethod(route="/datasets/{dataset_id}", method="GET")
    async def get_dataset(
        self,
        dataset_id: str,
    ) -> Optional[Dataset]: ...

-    @webmethod(route="/datasets/list", method="GET")
-    async def list_datasets(self) -> List[Dataset]: ...
+    @webmethod(route="/datasets", method="GET")
+    async def list_datasets(self) -> ListDatasetsResponse: ...

-    @webmethod(route="/datasets/unregister", method="POST")
+    @webmethod(route="/datasets/{dataset_id}", method="DELETE")
    async def unregister_dataset(
        self,
        dataset_id: str,
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from enum import Enum
+
+from llama_models.schema_utils import json_schema_type
+
+
+@json_schema_type
+class Api(Enum):
+    inference = "inference"
+    safety = "safety"
+    agents = "agents"
+    vector_io = "vector_io"
+    datasetio = "datasetio"
+    scoring = "scoring"
+    eval = "eval"
+    post_training = "post_training"
+    tool_runtime = "tool_runtime"
+
+    telemetry = "telemetry"
+
+    models = "models"
+    shields = "shields"
+    vector_dbs = "vector_dbs"
+    datasets = "datasets"
+    scoring_functions = "scoring_functions"
+    eval_tasks = "eval_tasks"
+    tool_groups = "tool_groups"
+
+    # built-in API
+    inspect = "inspect"
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@ -6,10 +6,8 @@

 from typing import Any, Dict, List, Literal, Optional, Protocol, Union

-from llama_models.schema_utils import json_schema_type, webmethod
-
+from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, Field
-
 from typing_extensions import Annotated

 from llama_stack.apis.agents import AgentConfig
@ -33,9 +31,10 @@ class AgentCandidate(BaseModel):
    config: AgentConfig


-EvalCandidate = Annotated[
-    Union[ModelCandidate, AgentCandidate], Field(discriminator="type")
-]
+EvalCandidate = register_schema(
+    Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")],
+    name="EvalCandidate",
+)


@json_schema_type
@ -63,9 +62,12 @@ class AppEvalTaskConfig(BaseModel):
    # we could optionally add any specific dataset config here


-EvalTaskConfig = Annotated[
-    Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
-]
+EvalTaskConfig = register_schema(
+    Annotated[
+        Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
+    ],
+    name="EvalTaskConfig",
+)


@json_schema_type
@ -76,14 +78,14 @@ class EvaluateResponse(BaseModel):


 class Eval(Protocol):
-    @webmethod(route="/eval/run-eval", method="POST")
+    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
    async def run_eval(
        self,
        task_id: str,
        task_config: EvalTaskConfig,
    ) -> Job: ...

-    @webmethod(route="/eval/evaluate-rows", method="POST")
+    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
    async def evaluate_rows(
        self,
        task_id: str,
@ -92,11 +94,11 @@ class Eval(Protocol):
        task_config: EvalTaskConfig,
    ) -> EvaluateResponse: ...

-    @webmethod(route="/eval/job/status", method="GET")
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...

-    @webmethod(route="/eval/job/cancel", method="POST")
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
    async def job_cancel(self, task_id: str, job_id: str) -> None: ...

-    @webmethod(route="/eval/job/result", method="GET")
-    async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
+    async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
--- a/llama_stack/apis/eval_tasks/eval_tasks.py
+++ b/llama_stack/apis/eval_tasks/eval_tasks.py
@ -6,7 +6,6 @@
 from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable

 from llama_models.schema_utils import json_schema_type, webmethod
-
 from pydantic import BaseModel, Field

 from llama_stack.apis.resource import Resource, ResourceType
@ -40,15 +39,22 @@ class EvalTaskInput(CommonEvalTaskFields, BaseModel):
    provider_eval_task_id: Optional[str] = None


+class ListEvalTasksResponse(BaseModel):
+    data: List[EvalTask]
+
+
@runtime_checkable
 class EvalTasks(Protocol):
-    @webmethod(route="/eval-tasks/list", method="GET")
-    async def list_eval_tasks(self) -> List[EvalTask]: ...
+    @webmethod(route="/eval-tasks", method="GET")
+    async def list_eval_tasks(self) -> ListEvalTasksResponse: ...

-    @webmethod(route="/eval-tasks/get", method="GET")
-    async def get_eval_task(self, name: str) -> Optional[EvalTask]: ...
+    @webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
+    async def get_eval_task(
+        self,
+        eval_task_id: str,
+    ) -> Optional[EvalTask]: ...

-    @webmethod(route="/eval-tasks/register", method="POST")
+    @webmethod(route="/eval-tasks", method="POST")
    async def register_eval_task(
        self,
        eval_task_id: str,
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -5,7 +5,6 @@
 # the root directory of this source tree.

 from enum import Enum
-
 from typing import (
    Any,
    AsyncIterator,
@ -26,16 +25,12 @@ from llama_models.llama3.api.datatypes import (
    ToolDefinition,
    ToolPromptFormat,
 )
-
 from llama_models.schema_utils import json_schema_type, register_schema, webmethod
-
 from pydantic import BaseModel, Field, field_validator
 from typing_extensions import Annotated

-from llama_stack.apis.common.content_types import InterleavedContent
-
+from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
 from llama_stack.apis.models import Model
-
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol


@ -87,7 +82,7 @@ class SystemMessage(BaseModel):

@json_schema_type
 class ToolResponseMessage(BaseModel):
-    role: Literal["ipython"] = "ipython"
+    role: Literal["tool"] = "tool"
    # it was nice to re-use the ToolResponse type, but having all messages
    # have a `content` type makes things nicer too
    call_id: str
@ -152,35 +147,23 @@ class ChatCompletionResponseEventType(Enum):
    progress = "progress"


-@json_schema_type
-class ToolCallParseStatus(Enum):
-    started = "started"
-    in_progress = "in_progress"
-    failure = "failure"
-    success = "success"
-
-
-@json_schema_type
-class ToolCallDelta(BaseModel):
-    content: Union[str, ToolCall]
-    parse_status: ToolCallParseStatus
-
-
@json_schema_type
 class ChatCompletionResponseEvent(BaseModel):
    """Chat completion response event."""

    event_type: ChatCompletionResponseEventType
-    delta: Union[str, ToolCallDelta]
+    delta: ContentDelta
    logprobs: Optional[List[TokenLogProbs]] = None
    stop_reason: Optional[StopReason] = None


+@json_schema_type
 class ResponseFormatType(Enum):
    json_schema = "json_schema"
    grammar = "grammar"


+@json_schema_type
 class JsonSchemaResponseFormat(BaseModel):
    type: Literal[ResponseFormatType.json_schema.value] = (
        ResponseFormatType.json_schema.value
@ -188,6 +171,7 @@ class JsonSchemaResponseFormat(BaseModel):
    json_schema: Dict[str, Any]


+@json_schema_type
 class GrammarResponseFormat(BaseModel):
    type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
    bnf: Dict[str, Any]
@ -256,9 +240,7 @@ class ChatCompletionRequest(BaseModel):
    # zero-shot tool definitions as input to the model
    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(
-        default=ToolPromptFormat.json
-    )
+    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
    response_format: Optional[ResponseFormat] = None

    stream: Optional[bool] = False
@ -289,9 +271,7 @@ class BatchChatCompletionRequest(BaseModel):
    # zero-shot tool definitions as input to the model
    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(
-        default=ToolPromptFormat.json
-    )
+    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
    logprobs: Optional[LogProbConfig] = None


@ -314,7 +294,7 @@ class ModelStore(Protocol):
 class Inference(Protocol):
    model_store: ModelStore

-    @webmethod(route="/inference/completion")
+    @webmethod(route="/inference/completion", method="POST")
    async def completion(
        self,
        model_id: str,
@ -325,7 +305,7 @@ class Inference(Protocol):
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...

-    @webmethod(route="/inference/chat-completion")
+    @webmethod(route="/inference/chat-completion", method="POST")
    async def chat_completion(
        self,
        model_id: str,
@ -334,7 +314,7 @@ class Inference(Protocol):
        # zero-shot tool definitions as input to the model
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
@ -342,7 +322,7 @@ class Inference(Protocol):
        ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
    ]: ...

-    @webmethod(route="/inference/embeddings")
+    @webmethod(route="/inference/embeddings", method="POST")
    async def embeddings(
        self,
        model_id: str,
--- a/llama_stack/apis/inspect/inspect.py
+++ b/llama_stack/apis/inspect/inspect.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict, List, Protocol, runtime_checkable
+from typing import List, Protocol, runtime_checkable

 from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel
@ -12,6 +12,7 @@ from pydantic import BaseModel

@json_schema_type
 class ProviderInfo(BaseModel):
+    api: str
    provider_id: str
    provider_type: str

@ -34,13 +35,21 @@ class VersionInfo(BaseModel):
    version: str


+class ListProvidersResponse(BaseModel):
+    data: List[ProviderInfo]
+
+
+class ListRoutesResponse(BaseModel):
+    data: List[RouteInfo]
+
+
@runtime_checkable
 class Inspect(Protocol):
-    @webmethod(route="/providers/list", method="GET")
-    async def list_providers(self) -> Dict[str, ProviderInfo]: ...
+    @webmethod(route="/inspect/providers", method="GET")
+    async def list_providers(self) -> ListProvidersResponse: ...

-    @webmethod(route="/routes/list", method="GET")
-    async def list_routes(self) -> Dict[str, List[RouteInfo]]: ...
+    @webmethod(route="/inspect/routes", method="GET")
+    async def list_routes(self) -> ListRoutesResponse: ...

    @webmethod(route="/health", method="GET")
    async def health(self) -> HealthInfo: ...
--- a/llama_stack/apis/memory_banks/memory_banks.py
+++ b/llama_stack/apis/memory_banks/memory_banks.py
@ -1,152 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import (
-    Annotated,
-    List,
-    Literal,
-    Optional,
-    Protocol,
-    runtime_checkable,
-    Union,
-)
-
-from llama_models.schema_utils import json_schema_type, webmethod
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
-
-
-@json_schema_type
-class MemoryBankType(Enum):
-    vector = "vector"
-    keyvalue = "keyvalue"
-    keyword = "keyword"
-    graph = "graph"
-
-
-# define params for each type of memory bank, this leads to a tagged union
-# accepted as input from the API or from the config.
-@json_schema_type
-class VectorMemoryBankParams(BaseModel):
-    memory_bank_type: Literal[MemoryBankType.vector.value] = MemoryBankType.vector.value
-    embedding_model: str
-    chunk_size_in_tokens: int
-    overlap_size_in_tokens: Optional[int] = None
-
-
-@json_schema_type
-class KeyValueMemoryBankParams(BaseModel):
-    memory_bank_type: Literal[MemoryBankType.keyvalue.value] = (
-        MemoryBankType.keyvalue.value
-    )
-
-
-@json_schema_type
-class KeywordMemoryBankParams(BaseModel):
-    memory_bank_type: Literal[MemoryBankType.keyword.value] = (
-        MemoryBankType.keyword.value
-    )
-
-
-@json_schema_type
-class GraphMemoryBankParams(BaseModel):
-    memory_bank_type: Literal[MemoryBankType.graph.value] = MemoryBankType.graph.value
-
-
-BankParams = Annotated[
-    Union[
-        VectorMemoryBankParams,
-        KeyValueMemoryBankParams,
-        KeywordMemoryBankParams,
-        GraphMemoryBankParams,
-    ],
-    Field(discriminator="memory_bank_type"),
-]
-
-
-# Some common functionality for memory banks.
-class MemoryBankResourceMixin(Resource):
-    type: Literal[ResourceType.memory_bank.value] = ResourceType.memory_bank.value
-
-    @property
-    def memory_bank_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_memory_bank_id(self) -> str:
-        return self.provider_resource_id
-
-
-@json_schema_type
-class VectorMemoryBank(MemoryBankResourceMixin):
-    memory_bank_type: Literal[MemoryBankType.vector.value] = MemoryBankType.vector.value
-    embedding_model: str
-    chunk_size_in_tokens: int
-    embedding_dimension: Optional[int] = 384  # default to minilm-l6-v2
-    overlap_size_in_tokens: Optional[int] = None
-
-
-@json_schema_type
-class KeyValueMemoryBank(MemoryBankResourceMixin):
-    memory_bank_type: Literal[MemoryBankType.keyvalue.value] = (
-        MemoryBankType.keyvalue.value
-    )
-
-
-# TODO: KeyValue and Keyword are so similar in name, oof. Get a better naming convention.
-@json_schema_type
-class KeywordMemoryBank(MemoryBankResourceMixin):
-    memory_bank_type: Literal[MemoryBankType.keyword.value] = (
-        MemoryBankType.keyword.value
-    )
-
-
-@json_schema_type
-class GraphMemoryBank(MemoryBankResourceMixin):
-    memory_bank_type: Literal[MemoryBankType.graph.value] = MemoryBankType.graph.value
-
-
-MemoryBank = Annotated[
-    Union[
-        VectorMemoryBank,
-        KeyValueMemoryBank,
-        KeywordMemoryBank,
-        GraphMemoryBank,
-    ],
-    Field(discriminator="memory_bank_type"),
-]
-
-
-class MemoryBankInput(BaseModel):
-    memory_bank_id: str
-    params: BankParams
-    provider_memory_bank_id: Optional[str] = None
-
-
-@runtime_checkable
-@trace_protocol
-class MemoryBanks(Protocol):
-    @webmethod(route="/memory-banks/list", method="GET")
-    async def list_memory_banks(self) -> List[MemoryBank]: ...
-
-    @webmethod(route="/memory-banks/get", method="GET")
-    async def get_memory_bank(self, memory_bank_id: str) -> Optional[MemoryBank]: ...
-
-    @webmethod(route="/memory-banks/register", method="POST")
-    async def register_memory_bank(
-        self,
-        memory_bank_id: str,
-        params: BankParams,
-        provider_id: Optional[str] = None,
-        provider_memory_bank_id: Optional[str] = None,
-    ) -> MemoryBank: ...
-
-    @webmethod(route="/memory-banks/unregister", method="POST")
-    async def unregister_memory_bank(self, memory_bank_id: str) -> None: ...
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@ -52,16 +52,23 @@ class ModelInput(CommonModelFields):
    model_config = ConfigDict(protected_namespaces=())


+class ListModelsResponse(BaseModel):
+    data: List[Model]
+
+
@runtime_checkable
@trace_protocol
 class Models(Protocol):
-    @webmethod(route="/models/list", method="GET")
-    async def list_models(self) -> List[Model]: ...
+    @webmethod(route="/models", method="GET")
+    async def list_models(self) -> ListModelsResponse: ...

-    @webmethod(route="/models/get", method="GET")
-    async def get_model(self, identifier: str) -> Optional[Model]: ...
+    @webmethod(route="/models/{model_id}", method="GET")
+    async def get_model(
+        self,
+        model_id: str,
+    ) -> Optional[Model]: ...

-    @webmethod(route="/models/register", method="POST")
+    @webmethod(route="/models", method="POST")
    async def register_model(
        self,
        model_id: str,
@ -71,5 +78,8 @@ class Models(Protocol):
        model_type: Optional[ModelType] = None,
    ) -> Model: ...

-    @webmethod(route="/models/unregister", method="POST")
-    async def unregister_model(self, model_id: str) -> None: ...
+    @webmethod(route="/models/{model_id}", method="DELETE")
+    async def unregister_model(
+        self,
+        model_id: str,
+    ) -> None: ...
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@ -6,16 +6,13 @@

 from datetime import datetime
 from enum import Enum
-
 from typing import Any, Dict, List, Literal, Optional, Protocol, Union

-from llama_models.schema_utils import json_schema_type, webmethod
-
+from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated

 from llama_stack.apis.common.content_types import URL
-
 from llama_stack.apis.common.job_types import JobStatus
 from llama_stack.apis.common.training_types import Checkpoint

@ -27,11 +24,18 @@ class OptimizerType(Enum):
    sgd = "sgd"


+@json_schema_type
+class DatasetFormat(Enum):
+    instruct = "instruct"
+    dialog = "dialog"
+
+
@json_schema_type
 class DataConfig(BaseModel):
    dataset_id: str
    batch_size: int
    shuffle: bool
+    data_format: DatasetFormat
    validation_dataset_id: Optional[str] = None
    packed: Optional[bool] = False
    train_on_input: Optional[bool] = False
@ -84,9 +88,12 @@ class QATFinetuningConfig(BaseModel):
    group_size: int


-AlgorithmConfig = Annotated[
-    Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")
-]
+AlgorithmConfig = register_schema(
+    Annotated[
+        Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")
+    ],
+    name="AlgorithmConfig",
+)


@json_schema_type
@ -152,6 +159,10 @@ class PostTrainingJobStatusResponse(BaseModel):
    checkpoints: List[Checkpoint] = Field(default_factory=list)


+class ListPostTrainingJobsResponse(BaseModel):
+    data: List[PostTrainingJob]
+
+
@json_schema_type
 class PostTrainingJobArtifactsResponse(BaseModel):
    """Artifacts of a finetuning job."""
@ -190,7 +201,7 @@ class PostTraining(Protocol):
    ) -> PostTrainingJob: ...

    @webmethod(route="/post-training/jobs", method="GET")
-    async def get_training_jobs(self) -> List[PostTrainingJob]: ...
+    async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ...

    @webmethod(route="/post-training/job/status", method="GET")
    async def get_training_job_status(
--- a/llama_stack/apis/resource.py
+++ b/llama_stack/apis/resource.py
@ -14,7 +14,7 @@ from pydantic import BaseModel, Field
 class ResourceType(Enum):
    model = "model"
    shield = "shield"
-    memory_bank = "memory_bank"
+    vector_db = "vector_db"
    dataset = "dataset"
    scoring_function = "scoring_function"
    eval_task = "eval_task"
@ -37,5 +37,5 @@ class Resource(BaseModel):
    provider_id: str = Field(description="ID of the provider that owns this resource")

    type: ResourceType = Field(
-        description="Type of resource (e.g. 'model', 'shield', 'memory_bank', etc.)"
+        description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)"
    )
--- a/llama_stack/apis/safety/safety.py
+++ b/llama_stack/apis/safety/safety.py
@ -12,7 +12,6 @@ from pydantic import BaseModel, Field

 from llama_stack.apis.inference import Message
 from llama_stack.apis.shields import Shield
-
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol


@ -49,7 +48,7 @@ class ShieldStore(Protocol):
 class Safety(Protocol):
    shield_store: ShieldStore

-    @webmethod(route="/safety/run-shield")
+    @webmethod(route="/safety/run-shield", method="POST")
    async def run_shield(
        self,
        shield_id: str,
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@ -11,7 +11,6 @@ from pydantic import BaseModel

 from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams

-
 # mapping of metric to value
 ScoringResultRow = Dict[str, Any]

@ -43,7 +42,7 @@ class ScoringFunctionStore(Protocol):
 class Scoring(Protocol):
    scoring_function_store: ScoringFunctionStore

-    @webmethod(route="/scoring/score-batch")
+    @webmethod(route="/scoring/score-batch", method="POST")
    async def score_batch(
        self,
        dataset_id: str,
@ -51,7 +50,7 @@ class Scoring(Protocol):
        save_results_dataset: bool = False,
    ) -> ScoreBatchResponse: ...

-    @webmethod(route="/scoring/score")
+    @webmethod(route="/scoring/score", method="POST")
    async def score(
        self,
        input_rows: List[Dict[str, Any]],
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@ -16,12 +16,11 @@ from typing import (
    Union,
 )

-from llama_models.schema_utils import json_schema_type, webmethod
+from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated

 from llama_stack.apis.common.type_system import ParamType
-
 from llama_stack.apis.resource import Resource, ResourceType


@ -83,14 +82,17 @@ class BasicScoringFnParams(BaseModel):
    )


-ScoringFnParams = Annotated[
-    Union[
-        LLMAsJudgeScoringFnParams,
-        RegexParserScoringFnParams,
-        BasicScoringFnParams,
+ScoringFnParams = register_schema(
+    Annotated[
+        Union[
+            LLMAsJudgeScoringFnParams,
+            RegexParserScoringFnParams,
+            BasicScoringFnParams,
+        ],
+        Field(discriminator="type"),
    ],
-    Field(discriminator="type"),
-]
+    name="ScoringFnParams",
+)


 class CommonScoringFnFields(BaseModel):
@ -129,15 +131,21 @@ class ScoringFnInput(CommonScoringFnFields, BaseModel):
    provider_scoring_fn_id: Optional[str] = None


+class ListScoringFunctionsResponse(BaseModel):
+    data: List[ScoringFn]
+
+
@runtime_checkable
 class ScoringFunctions(Protocol):
-    @webmethod(route="/scoring-functions/list", method="GET")
-    async def list_scoring_functions(self) -> List[ScoringFn]: ...
+    @webmethod(route="/scoring-functions", method="GET")
+    async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...

-    @webmethod(route="/scoring-functions/get", method="GET")
-    async def get_scoring_function(self, scoring_fn_id: str) -> Optional[ScoringFn]: ...
+    @webmethod(route="/scoring-functions/{scoring_fn_id}", method="GET")
+    async def get_scoring_function(
+        self, scoring_fn_id: str, /
+    ) -> Optional[ScoringFn]: ...

-    @webmethod(route="/scoring-functions/register", method="POST")
+    @webmethod(route="/scoring-functions", method="POST")
    async def register_scoring_function(
        self,
        scoring_fn_id: str,
--- a/Show more
+++ b/Show more
				`@ -0,0 +1 @@`
				`../../llama_stack/templates/nvidia/build.yaml`