Merge remote-tracking branch 'origin/main' into stores

Ashwin Bharambe 2025-10-13 11:07:11 -07:00
commit b72154ce5e
1161 changed files with 609896 additions and 42960 deletions

View file

@ -54,6 +54,10 @@ runs:
SCRIPT_ARGS="$SCRIPT_ARGS --pattern ${{ inputs.pattern }}"
fi
echo "=== Running command ==="
echo "uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS"
echo ""
uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log
@ -62,11 +66,11 @@ runs:
shell: bash
run: |
echo "Checking for recording changes"
git status --porcelain tests/integration/recordings/
git status --porcelain tests/integration/
if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
if [[ -n $(git status --porcelain tests/integration/) ]]; then
echo "New recordings detected, committing and pushing"
git add tests/integration/recordings/
git add tests/integration/
git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
git fetch origin ${{ github.ref_name }}

View file

@ -43,9 +43,9 @@ jobs:
# Check if we should skip conformance testing due to breaking changes
- name: Check if conformance test should be skipped
id: skip-check
env:
PR_TITLE: ${{ github.event.pull_request.title }}
run: |
PR_TITLE="${{ github.event.pull_request.title }}"
# Skip if title contains "!:" indicating breaking change (like "feat!:")
if [[ "$PR_TITLE" == *"!:"* ]]; then
echo "skip=true" >> $GITHUB_OUTPUT

View file

@ -85,14 +85,15 @@ jobs:
cat $run_dir/run.yaml
# avoid line breaks in the server log, especially because we grep it below.
export COLUMNS=1984
nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
export LLAMA_STACK_LOG_WIDTH=200
nohup uv run llama stack run $run_dir/run.yaml > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
# Note: /v1/health does not require authentication
if curl -s -L http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
@ -111,4 +112,27 @@ jobs:
- name: Test auth
run: |
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
echo "Testing /v1/version without token (should succeed)..."
if curl -s -L -o /dev/null -w "%{http_code}" http://127.0.0.1:8321/v1/version | grep -q "200"; then
echo "/v1/version accessible without token (200)"
else
echo "/v1/version returned non-200 status without token"
exit 1
fi
echo "Testing /v1/providers without token (should fail with 401)..."
if curl -s -L -o /dev/null -w "%{http_code}" http://127.0.0.1:8321/v1/providers | grep -q "401"; then
echo "/v1/providers blocked without token (401)"
else
echo "/v1/providers did not return 401 without token"
exit 1
fi
echo "Testing /v1/providers with valid token (should succeed)..."
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers | jq
if [ $? -eq 0 ]; then
echo "/v1/providers accessible with valid token"
else
echo "/v1/providers failed with valid token"
exit 1
fi

View file

@ -54,7 +54,7 @@ jobs:
# Define (setup, suite) pairs - they are always matched and cannot be independent
# Weekly schedule (Sun 1 AM): vllm+base
# Input test-setup=ollama-vision: ollama-vision+vision
# Default (including test-setup=ollama): both ollama+base and ollama-vision+vision
# Default (including test-setup=ollama): ollama+base, ollama-vision+vision, gpt+responses
config: >-
${{
github.event.schedule == '1 0 * * 0'
@ -79,6 +79,8 @@ jobs:
- name: Run tests
uses: ./.github/actions/run-and-record-tests
env:
OPENAI_API_KEY: dummy
with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
setup: ${{ matrix.config.setup }}

View file

@ -18,7 +18,7 @@ jobs:
steps:
- name: Check comment author and get PR details
id: check_author
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@ -78,7 +78,7 @@ jobs:
- name: React to comment
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@ -91,7 +91,7 @@ jobs:
- name: Comment starting
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@ -189,7 +189,7 @@ jobs:
- name: Comment success with changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@ -202,7 +202,7 @@ jobs:
- name: Comment success without changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@ -215,7 +215,7 @@ jobs:
- name: Comment failure
if: failure()
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |

View file

@ -112,7 +112,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
@ -150,7 +150,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi

View file

@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install uv
uses: astral-sh/setup-uv@b75a909f75acd358c2196fb9a5f1299a9a8868a4 # v6.7.0
uses: astral-sh/setup-uv@eb1897b8dc4b5d5bfe39a428a8f2304605e0983c # v7.0.0
with:
python-version: ${{ matrix.python-version }}
activate-environment: true
@ -43,7 +43,5 @@ jobs:
uv pip list
uv pip show llama-stack
command -v llama
llama model prompt-format -m Llama3.2-90B-Vision-Instruct
llama model list
llama stack list-apis
llama stack list-providers inference

View file

@ -61,6 +61,9 @@ jobs:
- name: Run and record tests
uses: ./.github/actions/run-and-record-tests
env:
# Set OPENAI_API_KEY if using gpt setup
OPENAI_API_KEY: ${{ inputs.test-setup == 'gpt' && secrets.OPENAI_API_KEY || '' }}
with:
stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
setup: ${{ inputs.test-setup || 'ollama' }}

View file

@ -24,7 +24,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Stale Action
uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with:
stale-issue-label: 'stale'
stale-issue-message: >

View file

@ -59,7 +59,7 @@ jobs:
# Use the virtual environment created by the build step (name comes from build config)
source ramalama-stack-test/bin/activate
uv pip list
nohup llama stack run tests/external/ramalama-stack/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
nohup llama stack run tests/external/ramalama-stack/run.yaml > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |

View file

@ -59,7 +59,7 @@ jobs:
# Use the virtual environment created by the build step (name comes from build config)
source ci-test/bin/activate
uv pip list
nohup llama stack run tests/external/run-byoa.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
nohup llama stack run tests/external/run-byoa.yaml > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |

View file

@ -25,7 +25,7 @@ pip install -U llama_stack
MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
llama model download --source meta --model-id $MODEL --meta-url <META_URL>
huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu

View file

@ -52,7 +52,7 @@ You can access the HuggingFace trainer via the `starter` distribution:
```bash
llama stack build --distro starter --image-type venv
llama stack run --image-type venv ~/.llama/distributions/starter/starter-run.yaml
llama stack run ~/.llama/distributions/starter/starter-run.yaml
```
### Usage Example

View file

@ -219,13 +219,10 @@ group_tools = client.tools.list_tools(toolgroup_id="search_tools")
<TabItem value="setup" label="Setup & Configuration">
1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
2. [Optional] Provide the API key directly to the Llama Stack server
2. [Optional] Set the API key in your environment before starting the Llama Stack server
```bash
export TAVILY_SEARCH_API_KEY="your key"
```
```bash
--env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
```
</TabItem>
<TabItem value="implementation" label="Implementation">
@ -273,9 +270,9 @@ for log in EventLogger().log(response):
<TabItem value="setup" label="Setup & Configuration">
1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
2. Provide the API key either when starting the Llama Stack server:
2. Provide the API key either by setting it in your environment before starting the Llama Stack server:
```bash
--env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY}
export WOLFRAM_ALPHA_API_KEY="your key"
```
or from the client side:
```python

View file

@ -357,7 +357,7 @@ server:
8. Run the server:
```bash
python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
llama stack run ~/.llama/run-byoa.yaml
```
9. Test the API:

View file

@ -76,7 +76,7 @@ Integration tests are located in [tests/integration](https://github.com/meta-lla
Consult [tests/integration/README.md](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more details on how to run the tests.
Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command.
typically references some environment variables for specifying API keys and the like. You can set these in the environment before running the test command.
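For example, a minimal sketch (the variable names are illustrative and the helper script is the one invoked by the CI action elsewhere in this change; consult the README above for the exact invocation and any extra flags):
```bash
# Export the provider keys referenced by the relevant sample_run_config() methods,
# then run the integration-test helper (additional flags may be required).
export TAVILY_SEARCH_API_KEY="your key"     # illustrative; only needed by providers that use it
export OLLAMA_URL=http://localhost:11434    # illustrative default for an ollama-based setup
uv run --no-sync ./scripts/integration-tests.sh
```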
### 2. Unit Testing

View file

@ -68,7 +68,9 @@ recordings/
Direct API calls with no recording or replay:
```python
with inference_recording(mode=InferenceMode.LIVE):
from llama_stack.testing.api_recorder import api_recording, APIRecordingMode
with api_recording(mode=APIRecordingMode.LIVE):
response = await client.chat.completions.create(...)
```
@ -79,7 +81,7 @@ Use for initial development and debugging against real APIs.
Captures API interactions while passing through real responses:
```python
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
with api_recording(mode=APIRecordingMode.RECORD, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# Real API call made, response captured AND returned
```
@ -96,7 +98,7 @@ The recording process:
Returns stored responses instead of making API calls:
```python
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
with api_recording(mode=APIRecordingMode.REPLAY, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# No API call made, cached response returned instantly
```

View file

@ -170,7 +170,7 @@ spec:
- name: llama-stack
image: localhost/llama-stack-run-k8s:latest
imagePullPolicy: IfNotPresent
command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
command: ["llama", "stack", "run", "/app/config.yaml"]
ports:
- containerPort: 5000
volumeMounts:

View file

@ -289,10 +289,10 @@ After this step is successful, you should be able to find the built container im
docker run -d \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e OLLAMA_URL=http://host.docker.internal:11434 \
localhost/distribution-ollama:dev \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
--port $LLAMA_STACK_PORT
```
Here are the docker flags and their uses:
@ -305,12 +305,12 @@ Here are the docker flags and their uses:
* `localhost/distribution-ollama:dev`: The name and tag of the container image to run
* `-e INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the INFERENCE_MODEL environment variable in the container
* `-e OLLAMA_URL=http://host.docker.internal:11434`: Sets the OLLAMA_URL environment variable in the container
* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
* `--env INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the model to use for inference
* `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service
</TabItem>
</Tabs>
@ -320,23 +320,22 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME]
[--image-type {venv}] [--enable-ui]
[config | template]
[config | distro]
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments:
config | template Path to config file to use for the run or name of known template (`llama stack list` for a list). (default: None)
config | distro Path to config file to use for the run or name of known distro (`llama stack list` for a list). (default: None)
options:
-h, --help show this help message and exit
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
--image-name IMAGE_NAME
Name of the image to run. Defaults to the current environment (default: None)
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--image-type {venv}
Image Type used during the build. This should be venv. (default: None)
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--enable-ui Start the UI server (default: False)
```
@ -348,9 +347,6 @@ llama stack run tgi
# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
# Start using a venv
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```
```

View file

@ -101,7 +101,7 @@ A few things to note:
- The id is a string you can choose freely.
- You can instantiate any number of provider instances of the same type.
- The configuration dictionary is provider-specific.
- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.
- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server, you can set environment variables in your shell before running `llama stack run` to override the default values.
### Environment Variable Substitution
@ -173,13 +173,10 @@ optional_token: ${env.OPTIONAL_TOKEN:+}
#### Runtime Override
You can override environment variables at runtime when starting the server:
You can override environment variables at runtime by setting them in your shell before starting the server:
```bash
# Override specific environment variables
llama stack run --config run.yaml --env API_KEY=sk-123 --env BASE_URL=https://custom-api.com
# Or set them in your shell
# Set environment variables in your shell
export API_KEY=sk-123
export BASE_URL=https://custom-api.com
llama stack run --config run.yaml

View file

@ -52,7 +52,7 @@ spec:
value: "${SAFETY_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
command: ["llama", "stack", "run", "/etc/config/stack_run_config.yaml", "--port", "8321"]
ports:
- containerPort: 8321
volumeMounts:

View file

@ -69,10 +69,10 @@ docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
-e WATSONX_API_KEY=$WATSONX_API_KEY \
-e WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
-e WATSONX_BASE_URL=$WATSONX_BASE_URL \
llamastack/distribution-watsonx \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env WATSONX_API_KEY=$WATSONX_API_KEY \
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
--env WATSONX_BASE_URL=$WATSONX_BASE_URL
--port $LLAMA_STACK_PORT
```

View file

@ -129,11 +129,11 @@ docker run -it \
# NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
-v $HOME/git/llama-stack:/app/llama-stack-source -v $HOME/git/llama-models:/app/llama-models-source \
# localhost/distribution-dell:dev if building / testing locally
llamastack/distribution-dell\
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e DEH_URL=$DEH_URL \
-e CHROMA_URL=$CHROMA_URL \
llamastack/distribution-dell \
--port $LLAMA_STACK_PORT
```
@ -154,14 +154,14 @@ docker run \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $HOME/.llama:/root/.llama \
-v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e DEH_URL=$DEH_URL \
-e SAFETY_MODEL=$SAFETY_MODEL \
-e DEH_SAFETY_URL=$DEH_SAFETY_URL \
-e CHROMA_URL=$CHROMA_URL \
llamastack/distribution-dell \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
--port $LLAMA_STACK_PORT
```
### Via venv
@ -170,21 +170,21 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --distro dell --image-type venv
llama stack run dell
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \
CHROMA_URL=$CHROMA_URL \
llama stack run dell \
--port $LLAMA_STACK_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \
SAFETY_MODEL=$SAFETY_MODEL \
DEH_SAFETY_URL=$DEH_SAFETY_URL \
CHROMA_URL=$CHROMA_URL \
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
--port $LLAMA_STACK_PORT
```

View file

@ -41,31 +41,7 @@ The following environment variables can be configured:
## Prerequisite: Downloading Models
Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
```
$ llama model list --downloaded
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](../../references/llama_cli_reference/download_models.md) to download the models using the Hugging Face CLI.
```
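For example, a minimal sketch using the Hugging Face CLI (the model ids match the `INFERENCE_MODEL` and `SAFETY_MODEL` values used in the run commands below; you must have accepted each model's license on Hugging Face):
```bash
# Download the inference model checkpoint into ~/.llama
huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct
# Only needed if you use the Safety / Shield APIs
huggingface-cli download meta-llama/Llama-Guard-3-1B --local-dir ~/.llama/Llama-Guard-3-1B
```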
## Running the Distribution
@ -84,9 +60,9 @@ docker run \
--gpu all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llamastack/distribution-meta-reference-gpu \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
--port $LLAMA_STACK_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
@ -98,10 +74,10 @@ docker run \
--gpu all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
llamastack/distribution-meta-reference-gpu \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
--port $LLAMA_STACK_PORT
```
### Via venv
@ -110,16 +86,16 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
```bash
llama stack build --distro meta-reference-gpu --image-type venv
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llama stack run distributions/meta-reference-gpu/run.yaml \
--port 8321 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
--port 8321
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
--port 8321 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
--port 8321
```

View file

@ -129,10 +129,10 @@ docker run \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
llamastack/distribution-nvidia \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
--port $LLAMA_STACK_PORT
```
### Via venv
@ -142,10 +142,10 @@ If you've set up your local development environment, you can also build the imag
```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv
NVIDIA_API_KEY=$NVIDIA_API_KEY \
INFERENCE_MODEL=$INFERENCE_MODEL \
llama stack run ./run.yaml \
--port 8321 \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
--env INFERENCE_MODEL=$INFERENCE_MODEL
--port 8321
```
## Example Notebooks

View file

@ -86,9 +86,9 @@ docker run -it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e OLLAMA_URL=http://host.docker.internal:11434 \
llamastack/distribution-starter \
--port $LLAMA_STACK_PORT \
--env OLLAMA_URL=http://host.docker.internal:11434
--port $LLAMA_STACK_PORT
```
Note: to start the container with Podman, run the same command but replace `docker` at the start with
`podman`, as sketched below. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL`
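For reference, a minimal Podman sketch mirroring the Docker command above (the `OLLAMA_URL` host shown assumes Podman 4.7.0 or newer):
```bash
podman run -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  -e OLLAMA_URL=http://host.docker.internal:11434 \
  llamastack/distribution-starter \
  --port $LLAMA_STACK_PORT
```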
@ -106,9 +106,9 @@ docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
--network=host \
-e OLLAMA_URL=http://localhost:11434 \
llamastack/distribution-starter \
--port $LLAMA_STACK_PORT \
--env OLLAMA_URL=http://localhost:11434
--port $LLAMA_STACK_PORT
```
:::
You will see output like below:

View file

@ -1,4 +1,7 @@
---
description: "Files
This API is used to upload documents that can be used with other Llama Stack APIs."
sidebar_label: Files
title: Files
---
@ -7,4 +10,8 @@ title: Files
## Overview
Files
This API is used to upload documents that can be used with other Llama Stack APIs.
This section contains documentation for all available providers for the **files** API.

View file

@ -1,5 +1,7 @@
---
description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.
description: "Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
@ -12,7 +14,9 @@ title: Inference
## Overview
Llama Stack Inference API for generating completions, chat completions, and embeddings.
Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.

View file

@ -15,7 +15,8 @@ Anthropic inference provider for accessing Claude models and Anthropic's AI serv
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `api_key` | `str \| None` | No | | API key for Anthropic models |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
## Sample Configuration

View file

@ -22,7 +22,8 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |

View file

@ -15,6 +15,7 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |

View file

@ -15,8 +15,9 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `base_url` | `<class 'str'>` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Cerebras API Key |
## Sample Configuration

View file

@ -15,8 +15,9 @@ Databricks inference provider for running models on Databricks' unified analytic
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `url` | `<class 'str'>` | No | | The URL for the Databricks model serving endpoint |
| `api_token` | `<class 'pydantic.types.SecretStr'>` | No | | The Databricks API token |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The Databricks API token |
| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
## Sample Configuration

View file

@ -15,8 +15,9 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The Fireworks.ai API Key |
## Sample Configuration

View file

@ -15,7 +15,8 @@ Google Gemini inference provider for accessing Gemini models and Google's AI ser
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `api_key` | `str \| None` | No | | API key for Gemini models |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
## Sample Configuration

View file

@ -15,7 +15,8 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `api_key` | `str \| None` | No | | The Groq API key |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.groq.com | The URL for the Groq AI server |
## Sample Configuration

View file

@ -15,7 +15,8 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `api_key` | `str \| None` | No | | The Llama API key |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `openai_compat_api_base` | `<class 'str'>` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
## Sample Configuration

View file

@ -15,8 +15,9 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The NVIDIA API key, only needed if using the hosted service |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
| `append_api_version` | `<class 'bool'>` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |

View file

@ -15,8 +15,8 @@ Ollama inference provider for running local models through the Ollama runtime.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | http://localhost:11434 | |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |
## Sample Configuration

View file

@ -15,7 +15,8 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `api_key` | `str \| None` | No | | API key for OpenAI models |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `base_url` | `<class 'str'>` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
## Sample Configuration

View file

@ -15,8 +15,9 @@ Passthrough inference provider for connecting to any external inference service
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | API key for the passthrough endpoint |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
## Sample Configuration

View file

@ -15,8 +15,9 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The API token |
| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
| `api_token` | `str \| None` | No | | The API token |
## Sample Configuration

View file

@ -15,8 +15,9 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The SambaNova cloud API Key |
## Sample Configuration

View file

@ -15,6 +15,7 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint |
## Sample Configuration

View file

@ -15,8 +15,9 @@ Together AI inference provider for open-source models and collaborative AI devel
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The Together AI API Key |
## Sample Configuration

View file

@ -54,6 +54,7 @@ Available Models:
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |

View file

@ -15,11 +15,11 @@ Remote vLLM inference provider for connecting to vLLM servers.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The API token |
| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
| `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
| `api_token` | `str \| None` | No | fake | The API token |
| `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |
## Sample Configuration

View file

@ -15,9 +15,10 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base URL for accessing watsonx.ai |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
| `project_id` | `str \| None` | No | | The Project ID key |
| `project_id` | `str \| None` | No | | The watsonx.ai project ID |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
## Sample Configuration

View file

@ -1,4 +1,7 @@
---
description: "Safety
OpenAI-compatible Moderations API."
sidebar_label: Safety
title: Safety
---
@ -7,4 +10,8 @@ title: Safety
## Overview
Safety
OpenAI-compatible Moderations API.
This section contains documentation for all available providers for the **safety** API.

View file

@ -15,6 +15,7 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |

View file

@ -25,141 +25,42 @@ You have two ways to install Llama Stack:
cd llama-stack
pip install -e .
## Downloading models via CLI
## Downloading models via Hugging Face CLI
You first need to have models downloaded locally.
You first need to have models downloaded locally. We recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli) to download models.
To download any model you need the **Model Descriptor**.
This can be obtained by running the command
```
llama model list
```
### Install Hugging Face CLI
You should see a table like this:
```
+----------------------------------+------------------------------------------+----------------+
| Model Descriptor(ID) | Hugging Face Repo | Context Length |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-8B | meta-llama/Llama-3.1-8B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-70B | meta-llama/Llama-3.1-70B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B:bf16-mp8 | meta-llama/Llama-3.1-405B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B | meta-llama/Llama-3.1-405B-FP8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B:bf16-mp16 | meta-llama/Llama-3.1-405B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-8B-Instruct | meta-llama/Llama-3.1-8B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-70B-Instruct | meta-llama/Llama-3.1-70B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct:bf16-mp8 | meta-llama/Llama-3.1-405B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct | meta-llama/Llama-3.1-405B-Instruct-FP8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct:bf16-mp16 | meta-llama/Llama-3.1-405B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-1B | meta-llama/Llama-3.2-1B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-3B | meta-llama/Llama-3.2-3B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-11B-Vision | meta-llama/Llama-3.2-11B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-90B-Vision | meta-llama/Llama-3.2-90B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-1B-Instruct | meta-llama/Llama-3.2-1B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-3B-Instruct | meta-llama/Llama-3.2-3B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-11B-Vision-Instruct | meta-llama/Llama-3.2-11B-Vision-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-90B-Vision-Instruct | meta-llama/Llama-3.2-90B-Vision-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-11B-Vision | meta-llama/Llama-Guard-3-11B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-1B:int4-mp1 | meta-llama/Llama-Guard-3-1B-INT4 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-1B | meta-llama/Llama-Guard-3-1B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-8B | meta-llama/Llama-Guard-3-8B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-8B:int8-mp1 | meta-llama/Llama-Guard-3-8B-INT8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Prompt-Guard-86M | meta-llama/Prompt-Guard-86M | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-2-8B | meta-llama/Llama-Guard-2-8B | 4K |
+----------------------------------+------------------------------------------+----------------+
```
To download models, you can use the `llama download` command.
#### Downloading from [Meta](https://llama.meta.com/llama-downloads/)
Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/). Note: You need to quote the META_URL
Download the required checkpoints using the following commands:
First, install the Hugging Face CLI:
```bash
# download the 8B model, this can be run on a single GPU
llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url 'META_URL'
# you can also get the 70B model, this will require 8 GPUs however
llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url 'META_URL'
# llama-agents have safety enabled by default. For this, you will need
# safety models -- Llama-Guard and Prompt-Guard
llama download --source meta --model-id Prompt-Guard-86M --meta-url 'META_URL'
llama download --source meta --model-id Llama-Guard-3-1B --meta-url 'META_URL'
pip install huggingface_hub[cli]
```
#### Downloading from [Hugging Face](https://huggingface.co/meta-llama)
### Download models from Hugging Face
Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`.
You can download models using the `huggingface-cli download` command. Here are some examples:
```bash
llama download --source huggingface --model-id Llama3.1-8B-Instruct --hf-token <HF_TOKEN>
# Download Llama 3.2 3B Instruct model
huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct
llama download --source huggingface --model-id Llama3.1-70B-Instruct --hf-token <HF_TOKEN>
# Download Llama 3.2 1B Instruct model
huggingface-cli download meta-llama/Llama-3.2-1B-Instruct --local-dir ~/.llama/Llama-3.2-1B-Instruct
llama download --source huggingface --model-id Llama-Guard-3-1B --ignore-patterns *original*
llama download --source huggingface --model-id Prompt-Guard-86M --ignore-patterns *original*
```
**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
```{tip}
Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
# Download Llama Guard 3 1B model
huggingface-cli download meta-llama/Llama-Guard-3-1B --local-dir ~/.llama/Llama-Guard-3-1B
# Download Prompt Guard model
huggingface-cli download meta-llama/Prompt-Guard-86M --local-dir ~/.llama/Prompt-Guard-86M
```
**Important:** You need to authenticate with Hugging Face to download models. You can do this by:
1. Getting your token from [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
2. Running `huggingface-cli login` and entering your token (a non-interactive sketch is shown below)
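A minimal non-interactive sketch (the token value is a placeholder):
```bash
# Either log in once and cache the token locally ...
huggingface-cli login --token "$HF_TOKEN"
# ... or export the token for the current shell; huggingface-cli picks it up automatically
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
```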
## List the downloaded models
To list the downloaded models, use the following command:
```
llama model list --downloaded
```
You should see a table like this:
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
To list the downloaded models, you can use the Hugging Face CLI:
```bash
# List all downloaded models in your local cache
huggingface-cli scan-cache
```

View file

@ -27,9 +27,9 @@ You have two ways to install Llama Stack:
## `llama` subcommands
1. `download`: Supports downloading models from Meta or Hugging Face. [Downloading models](#downloading-models)
2. `model`: Lists available models and their properties. [Understanding models](#understand-the-models)
3. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../distributions/building_distro) documentation.
1. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../distributions/building_distro) documentation.
For downloading models, we recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli). See [Downloading models](#downloading-models) for more information.
### Sample Usage
@ -38,239 +38,41 @@ llama --help
```
```
usage: llama [-h] {download,model,stack} ...
usage: llama [-h] {stack} ...
Welcome to the Llama CLI
options:
-h, --help show this help message and exit
-h, --help show this help message and exit
subcommands:
{download,model,stack}
{stack}
stack Operations for the Llama Stack / Distributions
```
## Downloading models
You first need to have models downloaded locally.
You first need to have models downloaded locally. We recommend using the [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli) to download models.
To download any model you need the **Model Descriptor**.
This can be obtained by running the command
```
llama model list
```
You should see a table like this:
```
+----------------------------------+------------------------------------------+----------------+
| Model Descriptor(ID) | Hugging Face Repo | Context Length |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-8B | meta-llama/Llama-3.1-8B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-70B | meta-llama/Llama-3.1-70B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B:bf16-mp8 | meta-llama/Llama-3.1-405B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B | meta-llama/Llama-3.1-405B-FP8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B:bf16-mp16 | meta-llama/Llama-3.1-405B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-8B-Instruct | meta-llama/Llama-3.1-8B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-70B-Instruct | meta-llama/Llama-3.1-70B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct:bf16-mp8 | meta-llama/Llama-3.1-405B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct | meta-llama/Llama-3.1-405B-Instruct-FP8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct:bf16-mp16 | meta-llama/Llama-3.1-405B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-1B | meta-llama/Llama-3.2-1B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-3B | meta-llama/Llama-3.2-3B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-11B-Vision | meta-llama/Llama-3.2-11B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-90B-Vision | meta-llama/Llama-3.2-90B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-1B-Instruct | meta-llama/Llama-3.2-1B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-3B-Instruct | meta-llama/Llama-3.2-3B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-11B-Vision-Instruct | meta-llama/Llama-3.2-11B-Vision-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-90B-Vision-Instruct | meta-llama/Llama-3.2-90B-Vision-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-11B-Vision | meta-llama/Llama-Guard-3-11B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-1B:int4-mp1 | meta-llama/Llama-Guard-3-1B-INT4 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-1B | meta-llama/Llama-Guard-3-1B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-8B | meta-llama/Llama-Guard-3-8B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-8B:int8-mp1 | meta-llama/Llama-Guard-3-8B-INT8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Prompt-Guard-86M | meta-llama/Prompt-Guard-86M | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-2-8B | meta-llama/Llama-Guard-2-8B | 4K |
+----------------------------------+------------------------------------------+----------------+
```
To download models, you can use the `llama download` command.
### Downloading from [Meta](https://llama.meta.com/llama-downloads/)
Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/)
Download the required checkpoints using the following commands:
First, install the Hugging Face CLI:
```bash
# download the 8B model, this can be run on a single GPU
llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url META_URL
# you can also get the 70B model, this will require 8 GPUs however
llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url META_URL
# llama-agents have safety enabled by default. For this, you will need
# safety models -- Llama-Guard and Prompt-Guard
llama download --source meta --model-id Prompt-Guard-86M --meta-url META_URL
llama download --source meta --model-id Llama-Guard-3-1B --meta-url META_URL
pip install huggingface_hub[cli]
```
### Downloading from [Hugging Face](https://huggingface.co/meta-llama)
Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`.
Then authenticate and download models:
```bash
llama download --source huggingface --model-id Llama3.1-8B-Instruct --hf-token <HF_TOKEN>
# Authenticate with Hugging Face
huggingface-cli login
llama download --source huggingface --model-id Llama3.1-70B-Instruct --hf-token <HF_TOKEN>
llama download --source huggingface --model-id Llama-Guard-3-1B --ignore-patterns *original*
llama download --source huggingface --model-id Prompt-Guard-86M --ignore-patterns *original*
```
**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
```{tip}
The default for `llama download` is to run with `--ignore-patterns *.safetensors`, since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
# Download a model
huggingface-cli download meta-llama/Llama-3.2-3B-Instruct --local-dir ~/.llama/Llama-3.2-3B-Instruct
```
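The same download can also be driven from Python with `huggingface_hub.snapshot_download`, which is essentially what the CLI command above wraps; a sketch using the same repo and target directory:

```python
from pathlib import Path
from huggingface_hub import snapshot_download

# Mirror the CLI call above: pull the full snapshot into ~/.llama/Llama-3.2-3B-Instruct.
# Authentication comes from `huggingface-cli login` or the HF_TOKEN environment variable.
snapshot_download(
    repo_id="meta-llama/Llama-3.2-3B-Instruct",
    local_dir=Path.home() / ".llama" / "Llama-3.2-3B-Instruct",
)
```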
## List the downloaded models
To list the downloaded models, use the following command:
```
llama model list --downloaded
```
You should see a table like this:
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
```
## Understand the models
The `llama model` command helps you explore a model's interface.
1. `download`: Download the model from different sources (Meta, Hugging Face).
2. `list`: Lists all the models available for download, with hardware requirements for deploying the models.
3. `prompt-format`: Shows llama model message formats.
4. `describe`: Describes all the properties of the model.
### Sample Usage
`llama model <subcommand> <options>`
```
llama model --help
```
```
usage: llama model [-h] {download,list,prompt-format,describe,verify-download,remove} ...
Work with llama models
options:
-h, --help show this help message and exit
model_subcommands:
{download,list,prompt-format,describe,verify-download,remove}
```
### Describe
You can use the describe command to know more about a model:
```
llama model describe -m Llama3.2-3B-Instruct
```
```
+-----------------------------+----------------------------------+
| Model | Llama3.2-3B-Instruct |
+-----------------------------+----------------------------------+
| Hugging Face ID | meta-llama/Llama-3.2-3B-Instruct |
+-----------------------------+----------------------------------+
| Description | Llama 3.2 3b instruct model |
+-----------------------------+----------------------------------+
| Context Length | 128K tokens |
+-----------------------------+----------------------------------+
| Weights format | bf16 |
+-----------------------------+----------------------------------+
| Model params.json | { |
| | "dim": 3072, |
| | "n_layers": 28, |
| | "n_heads": 24, |
| | "n_kv_heads": 8, |
| | "vocab_size": 128256, |
| | "ffn_dim_multiplier": 1.0, |
| | "multiple_of": 256, |
| | "norm_eps": 1e-05, |
| | "rope_theta": 500000.0, |
| | "use_scaled_rope": true |
| | } |
+-----------------------------+----------------------------------+
| Recommended sampling params | { |
| | "temperature": 1.0, |
| | "top_p": 0.9, |
| | "top_k": 0 |
| | } |
+-----------------------------+----------------------------------+
```
### Prompt Format
You can even run `llama model prompt-format` to see all of the templates and their tokens:
```
llama model prompt-format -m Llama3.2-3B-Instruct
```
![alt text](/img/prompt-format.png)
You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios.
**NOTE**: Outputs in the terminal are color-printed to highlight special tokens.
### Remove model
You can run `llama model remove` to remove an unnecessary model:
```
llama model remove -m Llama-Guard-3-8B-int8
```
To list the downloaded models, you can use the Hugging Face CLI:
```bash
# List all downloaded models in your local cache
huggingface-cli scan-cache
```

View file

@ -123,12 +123,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the together inference provider\n",
"!uv run --with llama-stack llama stack build --distro together --image-type venv\n",
"!uv run --with llama-stack llama stack build --distro together\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" \"uv run --with llama-stack llama stack run together --image-type venv\",\n",
" \"uv run --with llama-stack llama stack run together\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

View file

@ -51,11 +51,11 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install uv\n",
"!pip install uv \"huggingface_hub[cli]\"\n",
"\n",
"MODEL=\"Llama-4-Scout-17B-16E-Instruct\"\n",
"# get meta url from llama.com\n",
"!uv run --with llama-stack llama model download --source meta --model-id $MODEL --meta-url <META_URL>\n",
"huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL\n",
"\n",
"model_id = f\"meta-llama/{MODEL}\""
]
@ -233,12 +233,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server\n",
"!uv run --with llama-stack llama stack build --distro meta-reference-gpu --image-type venv\n",
"!uv run --with llama-stack llama stack build --distro meta-reference-gpu\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"uv run --with llama-stack llama stack run meta-reference-gpu --image-type venv --env INFERENCE_MODEL={model_id}\",\n",
" f\"INFERENCE_MODEL={model_id} uv run --with llama-stack llama stack run meta-reference-gpu\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

View file

@ -223,12 +223,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server\n",
"!uv run --with llama-stack llama stack build --distro llama_api --image-type venv\n",
"!uv run --with llama-stack llama stack build --distro llama_api\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" \"uv run --with llama-stack llama stack run llama_api --image-type venv\",\n",
" \"uv run --with llama-stack llama stack run llama_api\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

File diff suppressed because one or more lines are too long

View file

@ -23,6 +23,7 @@ from llama_stack.strong_typing.inspection import (
is_generic_list,
is_type_optional,
is_type_union,
is_unwrapped_body_param,
unwrap_generic_list,
unwrap_optional_type,
unwrap_union_types,
@ -769,24 +770,30 @@ class Generator:
first = next(iter(op.request_params))
request_name, request_type = first
op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request"
fields = [
(
name,
type_,
)
for name, type_ in op.request_params
]
request_type = make_dataclass(
request_name,
fields,
namespace={
"__doc__": create_docstring_for_request(
request_name, fields, doc_params
# Special case: if there's a single parameter with Body(embed=False) that's a BaseModel,
# unwrap it to show the flat structure in the OpenAPI spec
# Example: openai_chat_completion()
if (len(op.request_params) == 1 and is_unwrapped_body_param(request_type)):
pass
else:
op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request"
fields = [
(
name,
type_,
)
},
)
for name, type_ in op.request_params
]
request_type = make_dataclass(
request_name,
fields,
namespace={
"__doc__": create_docstring_for_request(
request_name, fields, doc_params
)
},
)
requestBody = RequestBody(
content={
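For context, the endpoint shape this special case targets is a single `Annotated[SomeRequestModel, Body(...)]` parameter. A rough sketch of the kind of check `is_unwrapped_body_param` is assumed to perform (this is not the actual helper, just an illustration of the pattern):

```python
import typing

from fastapi import params
from pydantic import BaseModel


def looks_like_unwrapped_body_param(annotation: typing.Any) -> bool:
    # Sketch: Annotated[SomeBaseModel, Body(...)] where the body is not embedded,
    # so the model's fields appear flat in the generated OpenAPI request body.
    metadata = getattr(annotation, "__metadata__", None)   # only Annotated[...] carries this
    base_type = getattr(annotation, "__origin__", None)
    if not metadata or base_type is None:
        return False
    is_model = isinstance(base_type, type) and issubclass(base_type, BaseModel)
    has_plain_body = any(isinstance(m, params.Body) and not m.embed for m in metadata)
    return is_model and has_plain_body
```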

View file

@ -8,10 +8,11 @@ import json
import typing
import inspect
from pathlib import Path
from typing import TextIO
from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
from typing import Any, List, Optional, TextIO, Union, get_type_hints, get_origin, get_args
from pydantic import BaseModel
from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
from llama_stack.strong_typing.inspection import is_unwrapped_body_param
from llama_stack.core.resolver import api_protocol_map
from .generator import Generator
@ -205,6 +206,14 @@ def _validate_has_return_in_docstring(method) -> str | None:
def _validate_has_params_in_docstring(method) -> str | None:
source = inspect.getsource(method)
sig = inspect.signature(method)
params_list = [p for p in sig.parameters.values() if p.name != "self"]
if len(params_list) == 1:
param = params_list[0]
param_type = param.annotation
if is_unwrapped_body_param(param_type):
return
# Only check if the method has more than one parameter
if len(sig.parameters) > 1 and ":param" not in source:
return "does not have a ':param' in its docstring"

View file

@ -145,12 +145,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
"!uv run --with llama-stack llama stack build --distro starter --image-type venv\n",
"!uv run --with llama-stack llama stack build --distro starter\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -88,7 +88,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
...
Build Successful!
You can find the newly-built template here: ~/.llama/distributions/starter/starter-run.yaml
You can run the new Llama Stack Distro via: uv run --with llama-stack llama stack run starter --image-type venv
You can run the new Llama Stack Distro via: uv run --with llama-stack llama stack run starter
```
3. **Set the ENV variables by exporting them to the terminal**:
@ -102,12 +102,11 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
3. **Run the Llama Stack**:
Run the stack using uv:
```bash
INFERENCE_MODEL=$INFERENCE_MODEL \
SAFETY_MODEL=$SAFETY_MODEL \
OLLAMA_URL=$OLLAMA_URL \
uv run --with llama-stack llama stack run starter \
--image-type venv \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env OLLAMA_URL=$OLLAMA_URL
--port $LLAMA_STACK_PORT
```
Note: Every time you run a new model with `ollama run`, you will need to restart the Llama Stack server; otherwise it won't see the new model.

View file

@ -797,7 +797,7 @@ class Agents(Protocol):
self,
response_id: str,
) -> OpenAIResponseObject:
"""Retrieve an OpenAI response by its ID.
"""Get a model response.
:param response_id: The ID of the OpenAI response to retrieve.
:returns: An OpenAIResponseObject.
@ -812,6 +812,7 @@ class Agents(Protocol):
model: str,
instructions: str | None = None,
previous_response_id: str | None = None,
conversation: str | None = None,
store: bool | None = True,
stream: bool | None = False,
temperature: float | None = None,
@ -826,11 +827,12 @@ class Agents(Protocol):
),
] = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a new OpenAI response.
"""Create a model response.
:param input: Input message(s) to create the response.
:param model: The underlying LLM used for completions.
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
:param include: (Optional) Additional fields to include in the response.
:param shields: (Optional) List of shields to apply during response generation. Can be shield IDs (strings) or shield specifications.
:returns: An OpenAIResponseObject.
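For example, a client could attach a response to an existing conversation through the OpenAI-compatible endpoint. A hedged sketch using the `openai` Python client, where the base URL, API key, model id, and conversation id are all placeholders (older client versions may need the parameter passed via `extra_body`, as shown here):

```python
from openai import OpenAI

# Point the standard OpenAI client at a locally running Llama Stack server (URL/key are placeholders)
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",              # illustrative model id
    input="Summarize our discussion so far.",
    # Conversation IDs must begin with 'conv_'; input and output are added to it automatically.
    extra_body={"conversation": "conv_0123456789abcdef"},
)
```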
@ -846,7 +848,7 @@ class Agents(Protocol):
model: str | None = None,
order: Order | None = Order.desc,
) -> ListOpenAIResponseObject:
"""List all OpenAI responses.
"""List all responses.
:param after: The ID of the last response to return.
:param limit: The number of responses to return.
@ -869,7 +871,7 @@ class Agents(Protocol):
limit: int | None = 20,
order: Order | None = Order.desc,
) -> ListOpenAIResponseInputItem:
"""List input items for a given OpenAI response.
"""List input items.
:param response_id: The ID of the response to retrieve input items for.
:param after: An item ID to list items after, used for pagination.
@ -884,7 +886,7 @@ class Agents(Protocol):
@webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
"""Delete an OpenAI response by its ID.
"""Delete a response.
:param response_id: The ID of the OpenAI response to delete.
:returns: An OpenAIDeleteResponseObject

View file

@ -346,6 +346,174 @@ class OpenAIResponseText(BaseModel):
format: OpenAIResponseTextFormat | None = None
# Must match type Literals of OpenAIResponseInputToolWebSearch below
WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"]
@json_schema_type
class OpenAIResponseInputToolWebSearch(BaseModel):
"""Web search tool configuration for OpenAI response inputs.
:param type: Web search tool type variant to use
:param search_context_size: (Optional) Size of search context, must be "low", "medium", or "high"
"""
# Must match values of WebSearchToolTypes above
type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
"web_search"
)
# TODO: actually use search_context_size somewhere...
search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$")
# TODO: add user_location
@json_schema_type
class OpenAIResponseInputToolFunction(BaseModel):
"""Function tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "function"
:param name: Name of the function that can be called
:param description: (Optional) Description of what the function does
:param parameters: (Optional) JSON schema defining the function's parameters
:param strict: (Optional) Whether to enforce strict parameter validation
"""
type: Literal["function"] = "function"
name: str
description: str | None = None
parameters: dict[str, Any] | None
strict: bool | None = None
@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
"""File search tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "file_search"
:param vector_store_ids: List of vector store identifiers to search within
:param filters: (Optional) Additional filters to apply to the search
:param max_num_results: (Optional) Maximum number of search results to return (1-50)
:param ranking_options: (Optional) Options for ranking and scoring search results
"""
type: Literal["file_search"] = "file_search"
vector_store_ids: list[str]
filters: dict[str, Any] | None = None
max_num_results: int | None = Field(default=10, ge=1, le=50)
ranking_options: FileSearchRankingOptions | None = None
class ApprovalFilter(BaseModel):
"""Filter configuration for MCP tool approval requirements.
:param always: (Optional) List of tool names that always require approval
:param never: (Optional) List of tool names that never require approval
"""
always: list[str] | None = None
never: list[str] | None = None
class AllowedToolsFilter(BaseModel):
"""Filter configuration for restricting which MCP tools can be used.
:param tool_names: (Optional) List of specific tool names that are allowed
"""
tool_names: list[str] | None = None
@json_schema_type
class OpenAIResponseInputToolMCP(BaseModel):
"""Model Context Protocol (MCP) tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "mcp"
:param server_label: Label to identify this MCP server
:param server_url: URL endpoint of the MCP server
:param headers: (Optional) HTTP headers to include when connecting to the server
:param require_approval: Approval requirement for tool calls ("always", "never", or filter)
:param allowed_tools: (Optional) Restriction on which tools can be used from this server
"""
type: Literal["mcp"] = "mcp"
server_label: str
server_url: str
headers: dict[str, Any] | None = None
require_approval: Literal["always"] | Literal["never"] | ApprovalFilter = "never"
allowed_tools: list[str] | AllowedToolsFilter | None = None
OpenAIResponseInputTool = Annotated[
OpenAIResponseInputToolWebSearch
| OpenAIResponseInputToolFileSearch
| OpenAIResponseInputToolFunction
| OpenAIResponseInputToolMCP,
Field(discriminator="type"),
]
register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
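As a quick illustration of the discriminated union above, raw tool dictionaries can be validated into the concrete models by their `type` field. This assumes the classes defined above are in scope; the vector store ID and server URL are placeholders:

```python
from pydantic import TypeAdapter

# Validate a heterogeneous tool list; the "type" discriminator picks the concrete model per entry.
tools_adapter = TypeAdapter(list[OpenAIResponseInputTool])
tools = tools_adapter.validate_python(
    [
        {"type": "web_search", "search_context_size": "high"},
        {"type": "file_search", "vector_store_ids": ["vs_example"], "max_num_results": 5},
        {"type": "mcp", "server_label": "docs", "server_url": "http://localhost:3000/mcp"},
    ]
)
print([type(tool).__name__ for tool in tools])
```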
@json_schema_type
class OpenAIResponseToolMCP(BaseModel):
"""Model Context Protocol (MCP) tool configuration for OpenAI response object.
:param type: Tool type identifier, always "mcp"
:param server_label: Label to identify this MCP server
:param allowed_tools: (Optional) Restriction on which tools can be used from this server
"""
type: Literal["mcp"] = "mcp"
server_label: str
allowed_tools: list[str] | AllowedToolsFilter | None = None
OpenAIResponseTool = Annotated[
OpenAIResponseInputToolWebSearch
| OpenAIResponseInputToolFileSearch
| OpenAIResponseInputToolFunction
| OpenAIResponseToolMCP, # The only type that differs from that in the inputs is the MCP tool
Field(discriminator="type"),
]
register_schema(OpenAIResponseTool, name="OpenAIResponseTool")
class OpenAIResponseUsageOutputTokensDetails(BaseModel):
"""Token details for output tokens in OpenAI response usage.
:param reasoning_tokens: Number of tokens used for reasoning (o1/o3 models)
"""
reasoning_tokens: int | None = None
class OpenAIResponseUsageInputTokensDetails(BaseModel):
"""Token details for input tokens in OpenAI response usage.
:param cached_tokens: Number of tokens retrieved from cache
"""
cached_tokens: int | None = None
@json_schema_type
class OpenAIResponseUsage(BaseModel):
"""Usage information for OpenAI response.
:param input_tokens: Number of tokens in the input
:param output_tokens: Number of tokens in the output
:param total_tokens: Total tokens used (input + output)
:param input_tokens_details: Detailed breakdown of input token usage
:param output_tokens_details: Detailed breakdown of output token usage
"""
input_tokens: int
output_tokens: int
total_tokens: int
input_tokens_details: OpenAIResponseUsageInputTokensDetails | None = None
output_tokens_details: OpenAIResponseUsageOutputTokensDetails | None = None
@json_schema_type
class OpenAIResponseObject(BaseModel):
"""Complete OpenAI response object containing generation results and metadata.
@ -362,7 +530,9 @@ class OpenAIResponseObject(BaseModel):
:param temperature: (Optional) Sampling temperature used for generation
:param text: Text formatting configuration for the response
:param top_p: (Optional) Nucleus sampling parameter used for generation
:param tools: (Optional) An array of tools the model may call while generating a response.
:param truncation: (Optional) Truncation strategy applied to the response
:param usage: (Optional) Token usage information for the response
"""
created_at: int
@ -379,7 +549,9 @@ class OpenAIResponseObject(BaseModel):
# before the field was added. New responses will have this set always.
text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
top_p: float | None = None
tools: list[OpenAIResponseTool] | None = None
truncation: str | None = None
usage: OpenAIResponseUsage | None = None
@json_schema_type
@ -400,7 +572,7 @@ class OpenAIDeleteResponseObject(BaseModel):
class OpenAIResponseObjectStreamResponseCreated(BaseModel):
"""Streaming event indicating a new response has been created.
:param response: The newly created response object
:param response: The response object that was created
:param type: Event type identifier, always "response.created"
"""
@ -408,11 +580,25 @@ class OpenAIResponseObjectStreamResponseCreated(BaseModel):
type: Literal["response.created"] = "response.created"
@json_schema_type
class OpenAIResponseObjectStreamResponseInProgress(BaseModel):
"""Streaming event indicating the response remains in progress.
:param response: Current response state while in progress
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.in_progress"
"""
response: OpenAIResponseObject
sequence_number: int
type: Literal["response.in_progress"] = "response.in_progress"
@json_schema_type
class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
"""Streaming event indicating a response has been completed.
:param response: The completed response object
:param response: Completed response object
:param type: Event type identifier, always "response.completed"
"""
@ -420,6 +606,34 @@ class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
type: Literal["response.completed"] = "response.completed"
@json_schema_type
class OpenAIResponseObjectStreamResponseIncomplete(BaseModel):
"""Streaming event emitted when a response ends in an incomplete state.
:param response: Response object describing the incomplete state
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.incomplete"
"""
response: OpenAIResponseObject
sequence_number: int
type: Literal["response.incomplete"] = "response.incomplete"
@json_schema_type
class OpenAIResponseObjectStreamResponseFailed(BaseModel):
"""Streaming event emitted when a response fails.
:param response: Response object describing the failure
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.failed"
"""
response: OpenAIResponseObject
sequence_number: int
type: Literal["response.failed"] = "response.failed"
@json_schema_type
class OpenAIResponseObjectStreamResponseOutputItemAdded(BaseModel):
"""Streaming event for when a new output item is added to the response.
@ -650,19 +864,46 @@ class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
@json_schema_type
class OpenAIResponseContentPartOutputText(BaseModel):
"""Text content within a streamed response part.
:param type: Content part type identifier, always "output_text"
:param text: Text emitted for this content part
:param annotations: Structured annotations associated with the text
:param logprobs: (Optional) Token log probability details
"""
type: Literal["output_text"] = "output_text"
text: str
# TODO: add annotations, logprobs, etc.
annotations: list[OpenAIResponseAnnotations] = Field(default_factory=list)
logprobs: list[dict[str, Any]] | None = None
@json_schema_type
class OpenAIResponseContentPartRefusal(BaseModel):
"""Refusal content within a streamed response part.
:param type: Content part type identifier, always "refusal"
:param refusal: Refusal text supplied by the model
"""
type: Literal["refusal"] = "refusal"
refusal: str
@json_schema_type
class OpenAIResponseContentPartReasoningText(BaseModel):
"""Reasoning text emitted as part of a streamed response.
:param type: Content part type identifier, always "reasoning_text"
:param text: Reasoning text supplied by the model
"""
type: Literal["reasoning_text"] = "reasoning_text"
text: str
OpenAIResponseContentPart = Annotated[
OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal,
OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText,
Field(discriminator="type"),
]
register_schema(OpenAIResponseContentPart, name="OpenAIResponseContentPart")
@ -672,15 +913,19 @@ register_schema(OpenAIResponseContentPart, name="OpenAIResponseContentPart")
class OpenAIResponseObjectStreamResponseContentPartAdded(BaseModel):
"""Streaming event for when a new content part is added to a response item.
:param content_index: Index position of the part within the content array
:param response_id: Unique identifier of the response containing this content
:param item_id: Unique identifier of the output item containing this content part
:param output_index: Index position of the output item in the response
:param part: The content part that was added
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.content_part.added"
"""
content_index: int
response_id: str
item_id: str
output_index: int
part: OpenAIResponseContentPart
sequence_number: int
type: Literal["response.content_part.added"] = "response.content_part.added"
@ -690,22 +935,269 @@ class OpenAIResponseObjectStreamResponseContentPartAdded(BaseModel):
class OpenAIResponseObjectStreamResponseContentPartDone(BaseModel):
"""Streaming event for when a content part is completed.
:param content_index: Index position of the part within the content array
:param response_id: Unique identifier of the response containing this content
:param item_id: Unique identifier of the output item containing this content part
:param output_index: Index position of the output item in the response
:param part: The completed content part
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.content_part.done"
"""
content_index: int
response_id: str
item_id: str
output_index: int
part: OpenAIResponseContentPart
sequence_number: int
type: Literal["response.content_part.done"] = "response.content_part.done"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningTextDelta(BaseModel):
"""Streaming event for incremental reasoning text updates.
:param content_index: Index position of the reasoning content part
:param delta: Incremental reasoning text being added
:param item_id: Unique identifier of the output item being updated
:param output_index: Index position of the item in the output list
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.reasoning_text.delta"
"""
content_index: int
delta: str
item_id: str
output_index: int
sequence_number: int
type: Literal["response.reasoning_text.delta"] = "response.reasoning_text.delta"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningTextDone(BaseModel):
"""Streaming event for when reasoning text is completed.
:param content_index: Index position of the reasoning content part
:param text: Final complete reasoning text
:param item_id: Unique identifier of the completed output item
:param output_index: Index position of the item in the output list
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.reasoning_text.done"
"""
content_index: int
text: str
item_id: str
output_index: int
sequence_number: int
type: Literal["response.reasoning_text.done"] = "response.reasoning_text.done"
@json_schema_type
class OpenAIResponseContentPartReasoningSummary(BaseModel):
"""Reasoning summary part in a streamed response.
:param type: Content part type identifier, always "summary_text"
:param text: Summary text
"""
type: Literal["summary_text"] = "summary_text"
text: str
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningSummaryPartAdded(BaseModel):
"""Streaming event for when a new reasoning summary part is added.
:param item_id: Unique identifier of the output item
:param output_index: Index position of the output item
:param part: The summary part that was added
:param sequence_number: Sequential number for ordering streaming events
:param summary_index: Index of the summary part within the reasoning summary
:param type: Event type identifier, always "response.reasoning_summary_part.added"
"""
item_id: str
output_index: int
part: OpenAIResponseContentPartReasoningSummary
sequence_number: int
summary_index: int
type: Literal["response.reasoning_summary_part.added"] = "response.reasoning_summary_part.added"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningSummaryPartDone(BaseModel):
"""Streaming event for when a reasoning summary part is completed.
:param item_id: Unique identifier of the output item
:param output_index: Index position of the output item
:param part: The completed summary part
:param sequence_number: Sequential number for ordering streaming events
:param summary_index: Index of the summary part within the reasoning summary
:param type: Event type identifier, always "response.reasoning_summary_part.done"
"""
item_id: str
output_index: int
part: OpenAIResponseContentPartReasoningSummary
sequence_number: int
summary_index: int
type: Literal["response.reasoning_summary_part.done"] = "response.reasoning_summary_part.done"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningSummaryTextDelta(BaseModel):
"""Streaming event for incremental reasoning summary text updates.
:param delta: Incremental summary text being added
:param item_id: Unique identifier of the output item
:param output_index: Index position of the output item
:param sequence_number: Sequential number for ordering streaming events
:param summary_index: Index of the summary part within the reasoning summary
:param type: Event type identifier, always "response.reasoning_summary_text.delta"
"""
delta: str
item_id: str
output_index: int
sequence_number: int
summary_index: int
type: Literal["response.reasoning_summary_text.delta"] = "response.reasoning_summary_text.delta"
@json_schema_type
class OpenAIResponseObjectStreamResponseReasoningSummaryTextDone(BaseModel):
"""Streaming event for when reasoning summary text is completed.
:param text: Final complete summary text
:param item_id: Unique identifier of the output item
:param output_index: Index position of the output item
:param sequence_number: Sequential number for ordering streaming events
:param summary_index: Index of the summary part within the reasoning summary
:param type: Event type identifier, always "response.reasoning_summary_text.done"
"""
text: str
item_id: str
output_index: int
sequence_number: int
summary_index: int
type: Literal["response.reasoning_summary_text.done"] = "response.reasoning_summary_text.done"
@json_schema_type
class OpenAIResponseObjectStreamResponseRefusalDelta(BaseModel):
"""Streaming event for incremental refusal text updates.
:param content_index: Index position of the content part
:param delta: Incremental refusal text being added
:param item_id: Unique identifier of the output item
:param output_index: Index position of the item in the output list
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.refusal.delta"
"""
content_index: int
delta: str
item_id: str
output_index: int
sequence_number: int
type: Literal["response.refusal.delta"] = "response.refusal.delta"
@json_schema_type
class OpenAIResponseObjectStreamResponseRefusalDone(BaseModel):
"""Streaming event for when refusal text is completed.
:param content_index: Index position of the content part
:param refusal: Final complete refusal text
:param item_id: Unique identifier of the output item
:param output_index: Index position of the item in the output list
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.refusal.done"
"""
content_index: int
refusal: str
item_id: str
output_index: int
sequence_number: int
type: Literal["response.refusal.done"] = "response.refusal.done"
@json_schema_type
class OpenAIResponseObjectStreamResponseOutputTextAnnotationAdded(BaseModel):
"""Streaming event for when an annotation is added to output text.
:param item_id: Unique identifier of the item to which the annotation is being added
:param output_index: Index position of the output item in the response's output array
:param content_index: Index position of the content part within the output item
:param annotation_index: Index of the annotation within the content part
:param annotation: The annotation object being added
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.output_text.annotation.added"
"""
item_id: str
output_index: int
content_index: int
annotation_index: int
annotation: OpenAIResponseAnnotations
sequence_number: int
type: Literal["response.output_text.annotation.added"] = "response.output_text.annotation.added"
@json_schema_type
class OpenAIResponseObjectStreamResponseFileSearchCallInProgress(BaseModel):
"""Streaming event for file search calls in progress.
:param item_id: Unique identifier of the file search call
:param output_index: Index position of the item in the output list
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.file_search_call.in_progress"
"""
item_id: str
output_index: int
sequence_number: int
type: Literal["response.file_search_call.in_progress"] = "response.file_search_call.in_progress"
@json_schema_type
class OpenAIResponseObjectStreamResponseFileSearchCallSearching(BaseModel):
"""Streaming event for file search currently searching.
:param item_id: Unique identifier of the file search call
:param output_index: Index position of the item in the output list
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.file_search_call.searching"
"""
item_id: str
output_index: int
sequence_number: int
type: Literal["response.file_search_call.searching"] = "response.file_search_call.searching"
@json_schema_type
class OpenAIResponseObjectStreamResponseFileSearchCallCompleted(BaseModel):
"""Streaming event for completed file search calls.
:param item_id: Unique identifier of the completed file search call
:param output_index: Index position of the item in the output list
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.file_search_call.completed"
"""
item_id: str
output_index: int
sequence_number: int
type: Literal["response.file_search_call.completed"] = "response.file_search_call.completed"
OpenAIResponseObjectStream = Annotated[
OpenAIResponseObjectStreamResponseCreated
| OpenAIResponseObjectStreamResponseInProgress
| OpenAIResponseObjectStreamResponseOutputItemAdded
| OpenAIResponseObjectStreamResponseOutputItemDone
| OpenAIResponseObjectStreamResponseOutputTextDelta
@ -725,6 +1217,20 @@ OpenAIResponseObjectStream = Annotated[
| OpenAIResponseObjectStreamResponseMcpCallCompleted
| OpenAIResponseObjectStreamResponseContentPartAdded
| OpenAIResponseObjectStreamResponseContentPartDone
| OpenAIResponseObjectStreamResponseReasoningTextDelta
| OpenAIResponseObjectStreamResponseReasoningTextDone
| OpenAIResponseObjectStreamResponseReasoningSummaryPartAdded
| OpenAIResponseObjectStreamResponseReasoningSummaryPartDone
| OpenAIResponseObjectStreamResponseReasoningSummaryTextDelta
| OpenAIResponseObjectStreamResponseReasoningSummaryTextDone
| OpenAIResponseObjectStreamResponseRefusalDelta
| OpenAIResponseObjectStreamResponseRefusalDone
| OpenAIResponseObjectStreamResponseOutputTextAnnotationAdded
| OpenAIResponseObjectStreamResponseFileSearchCallInProgress
| OpenAIResponseObjectStreamResponseFileSearchCallSearching
| OpenAIResponseObjectStreamResponseFileSearchCallCompleted
| OpenAIResponseObjectStreamResponseIncomplete
| OpenAIResponseObjectStreamResponseFailed
| OpenAIResponseObjectStreamResponseCompleted,
Field(discriminator="type"),
]
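A consumer of this stream typically switches on the `type` discriminator. A small sketch, using only event classes and fields defined above; `stream` is assumed to be an iterable of these events:

```python
def consume(stream) -> None:
    # `stream` is assumed to yield OpenAIResponseObjectStream events as defined above.
    for event in stream:
        if event.type == "response.reasoning_text.delta":
            print(event.delta, end="", flush=True)   # incremental reasoning text
        elif event.type == "response.refusal.delta":
            print(event.delta, end="", flush=True)   # incremental refusal text
        elif event.type in ("response.completed", "response.incomplete", "response.failed"):
            usage = event.response.usage             # OpenAIResponseUsage | None
            total = usage.total_tokens if usage else "n/a"
            print(f"\n[{event.type}] total_tokens={total}")
```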
@ -760,114 +1266,6 @@ OpenAIResponseInput = Annotated[
register_schema(OpenAIResponseInput, name="OpenAIResponseInput")
# Must match type Literals of OpenAIResponseInputToolWebSearch below
WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"]
@json_schema_type
class OpenAIResponseInputToolWebSearch(BaseModel):
"""Web search tool configuration for OpenAI response inputs.
:param type: Web search tool type variant to use
:param search_context_size: (Optional) Size of search context, must be "low", "medium", or "high"
"""
# Must match values of WebSearchToolTypes above
type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
"web_search"
)
# TODO: actually use search_context_size somewhere...
search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$")
# TODO: add user_location
@json_schema_type
class OpenAIResponseInputToolFunction(BaseModel):
"""Function tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "function"
:param name: Name of the function that can be called
:param description: (Optional) Description of what the function does
:param parameters: (Optional) JSON schema defining the function's parameters
:param strict: (Optional) Whether to enforce strict parameter validation
"""
type: Literal["function"] = "function"
name: str
description: str | None = None
parameters: dict[str, Any] | None
strict: bool | None = None
@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
"""File search tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "file_search"
:param vector_store_ids: List of vector store identifiers to search within
:param filters: (Optional) Additional filters to apply to the search
:param max_num_results: (Optional) Maximum number of search results to return (1-50)
:param ranking_options: (Optional) Options for ranking and scoring search results
"""
type: Literal["file_search"] = "file_search"
vector_store_ids: list[str]
filters: dict[str, Any] | None = None
max_num_results: int | None = Field(default=10, ge=1, le=50)
ranking_options: FileSearchRankingOptions | None = None
class ApprovalFilter(BaseModel):
"""Filter configuration for MCP tool approval requirements.
:param always: (Optional) List of tool names that always require approval
:param never: (Optional) List of tool names that never require approval
"""
always: list[str] | None = None
never: list[str] | None = None
class AllowedToolsFilter(BaseModel):
"""Filter configuration for restricting which MCP tools can be used.
:param tool_names: (Optional) List of specific tool names that are allowed
"""
tool_names: list[str] | None = None
@json_schema_type
class OpenAIResponseInputToolMCP(BaseModel):
"""Model Context Protocol (MCP) tool configuration for OpenAI response inputs.
:param type: Tool type identifier, always "mcp"
:param server_label: Label to identify this MCP server
:param server_url: URL endpoint of the MCP server
:param headers: (Optional) HTTP headers to include when connecting to the server
:param require_approval: Approval requirement for tool calls ("always", "never", or filter)
:param allowed_tools: (Optional) Restriction on which tools can be used from this server
"""
type: Literal["mcp"] = "mcp"
server_label: str
server_url: str
headers: dict[str, Any] | None = None
require_approval: Literal["always"] | Literal["never"] | ApprovalFilter = "never"
allowed_tools: list[str] | AllowedToolsFilter | None = None
OpenAIResponseInputTool = Annotated[
OpenAIResponseInputToolWebSearch
| OpenAIResponseInputToolFileSearch
| OpenAIResponseInputToolFunction
| OpenAIResponseInputToolMCP,
Field(discriminator="type"),
]
register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
class ListOpenAIResponseInputItem(BaseModel):
"""List container for OpenAI response input items.

View file

@ -86,3 +86,18 @@ class TokenValidationError(ValueError):
def __init__(self, message: str) -> None:
super().__init__(message)
class ConversationNotFoundError(ResourceNotFoundError):
"""raised when Llama Stack cannot find a referenced conversation"""
def __init__(self, conversation_id: str) -> None:
super().__init__(conversation_id, "Conversation", "client.conversations.list()")
class InvalidConversationIdError(ValueError):
"""raised when a conversation ID has an invalid format"""
def __init__(self, conversation_id: str) -> None:
message = f"Invalid conversation ID '{conversation_id}'. Expected an ID that begins with 'conv_'."
super().__init__(message)
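A small sketch of how these exceptions fit together when resolving a caller-supplied conversation ID (the helper function is illustrative, not part of the API):

```python
def resolve_conversation_id(conversation_id: str) -> str:
    # Reject IDs that do not carry the expected 'conv_' prefix before any lookup is attempted.
    if not conversation_id.startswith("conv_"):
        raise InvalidConversationIdError(conversation_id)
    return conversation_id
```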

View file

@ -96,7 +96,6 @@ class Api(Enum, metaclass=DynamicApiMeta):
:cvar telemetry: Observability and system monitoring
:cvar models: Model metadata and management
:cvar shields: Safety shield implementations
:cvar vector_dbs: Vector database management
:cvar datasets: Dataset creation and management
:cvar scoring_functions: Scoring function definitions
:cvar benchmarks: Benchmark suite management
@ -122,7 +121,6 @@ class Api(Enum, metaclass=DynamicApiMeta):
models = "models"
shields = "shields"
vector_dbs = "vector_dbs"
datasets = "datasets"
scoring_functions = "scoring_functions"
benchmarks = "benchmarks"

View file

@ -104,6 +104,11 @@ class OpenAIFileDeleteResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Files(Protocol):
"""Files
This API is used to upload documents that can be used with other Llama Stack APIs.
"""
# OpenAI Files API Endpoints
@webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
@ -113,7 +118,8 @@ class Files(Protocol):
purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after: Annotated[ExpiresAfter | None, Form()] = None,
) -> OpenAIFileObject:
"""
"""Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
@ -137,7 +143,8 @@ class Files(Protocol):
order: Order | None = Order.desc,
purpose: OpenAIFilePurpose | None = None,
) -> ListOpenAIFileResponse:
"""
"""List files.
Returns a list of files that belong to the user's organization.
:param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list.
@ -154,7 +161,8 @@ class Files(Protocol):
self,
file_id: str,
) -> OpenAIFileObject:
"""
"""Retrieve file.
Returns information about a specific file.
:param file_id: The ID of the file to use for this request.
@ -168,8 +176,7 @@ class Files(Protocol):
self,
file_id: str,
) -> OpenAIFileDeleteResponse:
"""
Delete a file.
"""Delete file.
:param file_id: The ID of the file to use for this request.
:returns: An OpenAIFileDeleteResponse indicating successful deletion.
@ -182,7 +189,8 @@ class Files(Protocol):
self,
file_id: str,
) -> Response:
"""
"""Retrieve file content.
Returns the contents of the specified file.
:param file_id: The ID of the file to use for this request.
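Since these routes mirror the OpenAI Files API, the standard `openai` client can exercise the full flow against a Llama Stack server. A hedged sketch, where the base URL, API key, file name, and purpose value are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")  # placeholders

# Upload, then retrieve metadata, fetch contents, and finally delete the file.
with open("notes.txt", "rb") as fh:
    uploaded = client.files.create(file=fh, purpose="assistants")

print(client.files.retrieve(uploaded.id).filename)
print(client.files.content(uploaded.id).text)
client.files.delete(uploaded.id)
```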

View file

@ -14,6 +14,7 @@ from typing import (
runtime_checkable,
)
from fastapi import Body
from pydantic import BaseModel, Field, field_validator
from typing_extensions import TypedDict
@ -776,12 +777,14 @@ class OpenAIChoiceDelta(BaseModel):
:param refusal: (Optional) The refusal of the delta
:param role: (Optional) The role of the delta
:param tool_calls: (Optional) The tool calls of the delta
:param reasoning_content: (Optional) The reasoning content from the model (non-standard, for o1/o3 models)
"""
content: str | None = None
refusal: str | None = None
role: str | None = None
tool_calls: list[OpenAIChatCompletionToolCall] | None = None
reasoning_content: str | None = None
@json_schema_type
@ -816,6 +819,42 @@ class OpenAIChoice(BaseModel):
logprobs: OpenAIChoiceLogprobs | None = None
class OpenAIChatCompletionUsageCompletionTokensDetails(BaseModel):
"""Token details for output tokens in OpenAI chat completion usage.
:param reasoning_tokens: Number of tokens used for reasoning (o1/o3 models)
"""
reasoning_tokens: int | None = None
class OpenAIChatCompletionUsagePromptTokensDetails(BaseModel):
"""Token details for prompt tokens in OpenAI chat completion usage.
:param cached_tokens: Number of tokens retrieved from cache
"""
cached_tokens: int | None = None
@json_schema_type
class OpenAIChatCompletionUsage(BaseModel):
"""Usage information for OpenAI chat completion.
:param prompt_tokens: Number of tokens in the prompt
:param completion_tokens: Number of tokens in the completion
:param total_tokens: Total tokens used (prompt + completion)
:param prompt_tokens_details: Detailed breakdown of prompt token usage
:param completion_tokens_details: Detailed breakdown of completion token usage
"""
prompt_tokens: int
completion_tokens: int
total_tokens: int
prompt_tokens_details: OpenAIChatCompletionUsagePromptTokensDetails | None = None
completion_tokens_details: OpenAIChatCompletionUsageCompletionTokensDetails | None = None
@json_schema_type
class OpenAIChatCompletion(BaseModel):
"""Response from an OpenAI-compatible chat completion request.
@ -825,6 +864,7 @@ class OpenAIChatCompletion(BaseModel):
:param object: The object type, which will be "chat.completion"
:param created: The Unix timestamp in seconds when the chat completion was created
:param model: The model that was used to generate the chat completion
:param usage: Token usage information for the completion
"""
id: str
@ -832,6 +872,7 @@ class OpenAIChatCompletion(BaseModel):
object: Literal["chat.completion"] = "chat.completion"
created: int
model: str
usage: OpenAIChatCompletionUsage | None = None
@json_schema_type
@ -843,6 +884,7 @@ class OpenAIChatCompletionChunk(BaseModel):
:param object: The object type, which will be "chat.completion.chunk"
:param created: The Unix timestamp in seconds when the chat completion was created
:param model: The model that was used to generate the chat completion
:param usage: Token usage information (typically included in final chunk with stream_options)
"""
id: str
@ -850,6 +892,7 @@ class OpenAIChatCompletionChunk(BaseModel):
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
created: int
model: str
usage: OpenAIChatCompletionUsage | None = None
@json_schema_type
@ -995,6 +1038,127 @@ class ListOpenAIChatCompletionResponse(BaseModel):
object: Literal["list"] = "list"
# extra_body can be accessed via .model_extra
@json_schema_type
class OpenAICompletionRequestWithExtraBody(BaseModel, extra="allow"):
"""Request parameters for OpenAI-compatible completion endpoint.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param prompt: The prompt to generate a completion for.
:param best_of: (Optional) The number of completions to generate.
:param echo: (Optional) Whether to echo the prompt.
:param frequency_penalty: (Optional) The penalty for repeated tokens.
:param logit_bias: (Optional) The logit bias to use.
:param logprobs: (Optional) The log probabilities to use.
:param max_tokens: (Optional) The maximum number of tokens to generate.
:param n: (Optional) The number of completions to generate.
:param presence_penalty: (Optional) The penalty for repeated tokens.
:param seed: (Optional) The seed to use.
:param stop: (Optional) The stop tokens to use.
:param stream: (Optional) Whether to stream the response.
:param stream_options: (Optional) The stream options to use.
:param temperature: (Optional) The temperature to use.
:param top_p: (Optional) The top p to use.
:param user: (Optional) The user to use.
:param suffix: (Optional) The suffix that should be appended to the completion.
"""
# Standard OpenAI completion parameters
model: str
prompt: str | list[str] | list[int] | list[list[int]]
best_of: int | None = None
echo: bool | None = None
frequency_penalty: float | None = None
logit_bias: dict[str, float] | None = None
logprobs: bool | None = None
max_tokens: int | None = None
n: int | None = None
presence_penalty: float | None = None
seed: int | None = None
stop: str | list[str] | None = None
stream: bool | None = None
stream_options: dict[str, Any] | None = None
temperature: float | None = None
top_p: float | None = None
user: str | None = None
suffix: str | None = None
# extra_body can be accessed via .model_extra
@json_schema_type
class OpenAIChatCompletionRequestWithExtraBody(BaseModel, extra="allow"):
"""Request parameters for OpenAI-compatible chat completion endpoint.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation.
:param frequency_penalty: (Optional) The penalty for repeated tokens.
:param function_call: (Optional) The function call to use.
:param functions: (Optional) List of functions to use.
:param logit_bias: (Optional) The logit bias to use.
:param logprobs: (Optional) The log probabilities to use.
:param max_completion_tokens: (Optional) The maximum number of tokens to generate.
:param max_tokens: (Optional) The maximum number of tokens to generate.
:param n: (Optional) The number of completions to generate.
:param parallel_tool_calls: (Optional) Whether to parallelize tool calls.
:param presence_penalty: (Optional) The penalty for repeated tokens.
:param response_format: (Optional) The response format to use.
:param seed: (Optional) The seed to use.
:param stop: (Optional) The stop tokens to use.
:param stream: (Optional) Whether to stream the response.
:param stream_options: (Optional) The stream options to use.
:param temperature: (Optional) The temperature to use.
:param tool_choice: (Optional) The tool choice to use.
:param tools: (Optional) The tools to use.
:param top_logprobs: (Optional) The top log probabilities to use.
:param top_p: (Optional) The top p to use.
:param user: (Optional) The user to use.
"""
# Standard OpenAI chat completion parameters
model: str
messages: Annotated[list[OpenAIMessageParam], Field(..., min_length=1)]
frequency_penalty: float | None = None
function_call: str | dict[str, Any] | None = None
functions: list[dict[str, Any]] | None = None
logit_bias: dict[str, float] | None = None
logprobs: bool | None = None
max_completion_tokens: int | None = None
max_tokens: int | None = None
n: int | None = None
parallel_tool_calls: bool | None = None
presence_penalty: float | None = None
response_format: OpenAIResponseFormatParam | None = None
seed: int | None = None
stop: str | list[str] | None = None
stream: bool | None = None
stream_options: dict[str, Any] | None = None
temperature: float | None = None
tool_choice: str | dict[str, Any] | None = None
tools: list[dict[str, Any]] | None = None
top_logprobs: int | None = None
top_p: float | None = None
user: str | None = None
# extra_body can be accessed via .model_extra
@json_schema_type
class OpenAIEmbeddingsRequestWithExtraBody(BaseModel, extra="allow"):
"""Request parameters for OpenAI-compatible embeddings endpoint.
:param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
:param encoding_format: (Optional) The format to return the embeddings in. Can be either "float" or "base64". Defaults to "float".
:param dimensions: (Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
:param user: (Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse.
"""
model: str
input: str | list[str]
encoding_format: str | None = "float"
dimensions: int | None = None
user: str | None = None
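Because these request models are declared with `extra="allow"`, any field the caller sends beyond the declared ones is kept and exposed through Pydantic's `model_extra` rather than rejected. A minimal standalone sketch of that pattern (a trimmed-down stand-in class, not the real import):

```python
from pydantic import BaseModel


# Stand-in for the *RequestWithExtraBody models above: extra="allow" keeps
# undeclared fields instead of raising a validation error.
class CompletionRequest(BaseModel, extra="allow"):
    model: str
    prompt: str
    temperature: float | None = None


req = CompletionRequest(
    model="llama3.2:3b",                 # hypothetical model id
    prompt="Say hello",
    guided_choice=["hello", "goodbye"],  # provider-specific, not declared
)

print(req.temperature)   # None (declared field, default)
print(req.model_extra)   # {'guided_choice': ['hello', 'goodbye']}
```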
@runtime_checkable
@trace_protocol
class InferenceProvider(Protocol):
@ -1029,50 +1193,11 @@ class InferenceProvider(Protocol):
@webmethod(route="/completions", method="POST", level=LLAMA_STACK_API_V1)
async def openai_completion(
self,
# Standard OpenAI completion parameters
model: str,
prompt: str | list[str] | list[int] | list[list[int]],
best_of: int | None = None,
echo: bool | None = None,
frequency_penalty: float | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_tokens: int | None = None,
n: int | None = None,
presence_penalty: float | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
top_p: float | None = None,
user: str | None = None,
# vLLM-specific parameters
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
# for fill-in-the-middle type completion
suffix: str | None = None,
params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)],
) -> OpenAICompletion:
"""Generate an OpenAI-compatible completion for the given prompt using the specified model.
"""Create completion.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param prompt: The prompt to generate a completion for.
:param best_of: (Optional) The number of completions to generate.
:param echo: (Optional) Whether to echo the prompt.
:param frequency_penalty: (Optional) The penalty for repeated tokens.
:param logit_bias: (Optional) The logit bias to use.
:param logprobs: (Optional) The log probabilities to use.
:param max_tokens: (Optional) The maximum number of tokens to generate.
:param n: (Optional) The number of completions to generate.
:param presence_penalty: (Optional) The penalty for repeated tokens.
:param seed: (Optional) The seed to use.
:param stop: (Optional) The stop tokens to use.
:param stream: (Optional) Whether to stream the response.
:param stream_options: (Optional) The stream options to use.
:param temperature: (Optional) The temperature to use.
:param top_p: (Optional) The top p to use.
:param user: (Optional) The user to use.
:param suffix: (Optional) The suffix that should be appended to the completion.
Generate an OpenAI-compatible completion for the given prompt using the specified model.
:returns: An OpenAICompletion.
"""
...
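The removed vLLM-specific parameters (`guided_choice`, `prompt_logprobs`) have not disappeared; a client can still send them through `extra_body`, and they arrive server-side in `params.model_extra`. A hedged sketch using the standard `openai` client; the base URL, API key, and model id are assumptions:

```python
from openai import OpenAI

# Assumed local Llama Stack server; adjust base_url and api_key as needed.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

resp = client.completions.create(
    model="llama3.2:3b",                       # hypothetical model id
    prompt="The capital of France is",
    max_tokens=5,
    # Provider-specific knobs ride along in extra_body and end up in
    # OpenAICompletionRequestWithExtraBody.model_extra on the server.
    extra_body={"guided_choice": ["Paris", "London"]},
)
print(resp.choices[0].text)
```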
@ -1081,55 +1206,11 @@ class InferenceProvider(Protocol):
@webmethod(route="/chat/completions", method="POST", level=LLAMA_STACK_API_V1)
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
params: Annotated[OpenAIChatCompletionRequestWithExtraBody, Body(...)],
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
"""Generate an OpenAI-compatible chat completion for the given messages using the specified model.
"""Create chat completions.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation.
:param frequency_penalty: (Optional) The penalty for repeated tokens.
:param function_call: (Optional) The function call to use.
:param functions: (Optional) List of functions to use.
:param logit_bias: (Optional) The logit bias to use.
:param logprobs: (Optional) The log probabilities to use.
:param max_completion_tokens: (Optional) The maximum number of tokens to generate.
:param max_tokens: (Optional) The maximum number of tokens to generate.
:param n: (Optional) The number of completions to generate.
:param parallel_tool_calls: (Optional) Whether to parallelize tool calls.
:param presence_penalty: (Optional) The penalty for repeated tokens.
:param response_format: (Optional) The response format to use.
:param seed: (Optional) The seed to use.
:param stop: (Optional) The stop tokens to use.
:param stream: (Optional) Whether to stream the response.
:param stream_options: (Optional) The stream options to use.
:param temperature: (Optional) The temperature to use.
:param tool_choice: (Optional) The tool choice to use.
:param tools: (Optional) The tools to use.
:param top_logprobs: (Optional) The top log probabilities to use.
:param top_p: (Optional) The top p to use.
:param user: (Optional) The user to use.
Generate an OpenAI-compatible chat completion for the given messages using the specified model.
:returns: An OpenAIChatCompletion.
"""
...
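The return type above is either a single `OpenAIChatCompletion` or an async iterator of chunks, depending on `stream`. From a client's point of view the call is unchanged by the move to a single `params` body; a minimal sketch with streaming enabled, again assuming a local server and a hypothetical model id:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")  # assumed local stack

stream = client.chat.completions.create(
    model="llama3.2:3b",  # hypothetical model id
    messages=[{"role": "user", "content": "Name three prime numbers."}],
    stream=True,          # the server responds with OpenAIChatCompletionChunk events
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="", flush=True)
print()
```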
@ -1138,26 +1219,20 @@ class InferenceProvider(Protocol):
@webmethod(route="/embeddings", method="POST", level=LLAMA_STACK_API_V1)
async def openai_embeddings(
self,
model: str,
input: str | list[str],
encoding_format: str | None = "float",
dimensions: int | None = None,
user: str | None = None,
params: Annotated[OpenAIEmbeddingsRequestWithExtraBody, Body(...)],
) -> OpenAIEmbeddingsResponse:
"""Generate OpenAI-compatible embeddings for the given input using the specified model.
"""Create embeddings.
:param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
:param encoding_format: (Optional) The format to return the embeddings in. Can be either "float" or "base64". Defaults to "float".
:param dimensions: (Optional) The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models.
:param user: (Optional) A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse.
Generate OpenAI-compatible embeddings for the given input using the specified model.
:returns: An OpenAIEmbeddingsResponse containing the embeddings.
"""
...
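The embeddings endpoint follows the same single-`params` pattern. A short client-side sketch (base URL and embedding model id are assumptions):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")  # assumed local stack

emb = client.embeddings.create(
    model="nomic-embed-text",  # hypothetical embedding model id
    input=["first document", "second document"],
    encoding_format="float",
)
print(len(emb.data), len(emb.data[0].embedding))
```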
class Inference(InferenceProvider):
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
"""Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
@ -1173,7 +1248,7 @@ class Inference(InferenceProvider):
model: str | None = None,
order: Order | None = Order.desc,
) -> ListOpenAIChatCompletionResponse:
"""List all chat completions.
"""List chat completions.
:param after: The ID of the last chat completion to return.
:param limit: The maximum number of chat completions to return.
@ -1188,7 +1263,9 @@ class Inference(InferenceProvider):
)
@webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
"""Describe a chat completion by its ID.
"""Get chat completion.
Describe a chat completion by its ID.
:param completion_id: ID of the chat completion.
:returns: A OpenAICompletionWithInputMessages.

View file

@ -58,25 +58,36 @@ class ListRoutesResponse(BaseModel):
@runtime_checkable
class Inspect(Protocol):
"""Inspect
APIs for inspecting the Llama Stack service, including its health status and the available API routes with their methods and implementing providers.
"""
@webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1)
async def list_routes(self) -> ListRoutesResponse:
"""List all available API routes with their methods and implementing providers.
"""List routes.
List all available API routes with their methods and implementing providers.
:returns: Response containing information about all available routes.
"""
...
@webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1)
@webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1, require_authentication=False)
async def health(self) -> HealthInfo:
"""Get the current health status of the service.
"""Get health status.
Get the current health status of the service.
:returns: Health information indicating if the service is operational.
"""
...
@webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1)
@webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1, require_authentication=False)
async def version(self) -> VersionInfo:
"""Get the version of the service.
"""Get version.
Get the version of the service.
:returns: Version information containing the service version number.
"""

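With `require_authentication=False` on these two routes, a probe can confirm liveness and version without credentials while the rest of the API stays protected. A small sketch (the server address is an assumption, and the 401 only applies when an auth provider is configured):

```python
import httpx

BASE = "http://localhost:8321"  # assumed local server address

# /v1/version and /v1/health opt out of authentication:
print(httpx.get(f"{BASE}/v1/version").json())
print(httpx.get(f"{BASE}/v1/health").json())

# Other routes still require a token when auth is enabled:
print(httpx.get(f"{BASE}/v1/providers").status_code)  # expect 401 without credentials
```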
View file

@ -124,7 +124,9 @@ class Models(Protocol):
self,
model_id: str,
) -> Model:
"""Get a model by its identifier.
"""Get model.
Get a model by its identifier.
:param model_id: The identifier of the model to get.
:returns: A Model.
@ -140,7 +142,9 @@ class Models(Protocol):
metadata: dict[str, Any] | None = None,
model_type: ModelType | None = None,
) -> Model:
"""Register a model.
"""Register model.
Register a model.
:param model_id: The identifier of the model to register.
:param provider_model_id: The identifier of the model in the provider.
@ -156,7 +160,9 @@ class Models(Protocol):
self,
model_id: str,
) -> None:
"""Unregister a model.
"""Unregister model.
Unregister a model.
:param model_id: The identifier of the model to unregister.
"""

View file

@ -94,7 +94,9 @@ class ListPromptsResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Prompts(Protocol):
"""Protocol for prompt management operations."""
"""Prompts
Protocol for prompt management operations."""
@webmethod(route="/prompts", method="GET", level=LLAMA_STACK_API_V1)
async def list_prompts(self) -> ListPromptsResponse:
@ -109,7 +111,9 @@ class Prompts(Protocol):
self,
prompt_id: str,
) -> ListPromptsResponse:
"""List all versions of a specific prompt.
"""List prompt versions.
List all versions of a specific prompt.
:param prompt_id: The identifier of the prompt to list versions for.
:returns: A ListPromptsResponse containing all versions of the prompt.
@ -122,7 +126,9 @@ class Prompts(Protocol):
prompt_id: str,
version: int | None = None,
) -> Prompt:
"""Get a prompt by its identifier and optional version.
"""Get prompt.
Get a prompt by its identifier and optional version.
:param prompt_id: The identifier of the prompt to get.
:param version: The version of the prompt to get (defaults to latest).
@ -136,7 +142,9 @@ class Prompts(Protocol):
prompt: str,
variables: list[str] | None = None,
) -> Prompt:
"""Create a new prompt.
"""Create prompt.
Create a new prompt.
:param prompt: The prompt text content with variable placeholders.
:param variables: List of variable names that can be used in the prompt template.
@ -153,7 +161,9 @@ class Prompts(Protocol):
variables: list[str] | None = None,
set_as_default: bool = True,
) -> Prompt:
"""Update an existing prompt (increments version).
"""Update prompt.
Update an existing prompt (increments version).
:param prompt_id: The identifier of the prompt to update.
:param prompt: The updated prompt text content.
@ -169,7 +179,9 @@ class Prompts(Protocol):
self,
prompt_id: str,
) -> None:
"""Delete a prompt.
"""Delete prompt.
Delete a prompt.
:param prompt_id: The identifier of the prompt to delete.
"""
@ -181,7 +193,9 @@ class Prompts(Protocol):
prompt_id: str,
version: int,
) -> Prompt:
"""Set which version of a prompt should be the default in get_prompt (latest).
"""Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
:param prompt_id: The identifier of the prompt.
:param version: The version to set as default.

View file

@ -42,13 +42,16 @@ class ListProvidersResponse(BaseModel):
@runtime_checkable
class Providers(Protocol):
"""
"""Providers
Providers API for inspecting, listing, and modifying providers and their configurations.
"""
@webmethod(route="/providers", method="GET", level=LLAMA_STACK_API_V1)
async def list_providers(self) -> ListProvidersResponse:
"""List all available providers.
"""List providers.
List all available providers.
:returns: A ListProvidersResponse containing information about all providers.
"""
@ -56,7 +59,9 @@ class Providers(Protocol):
@webmethod(route="/providers/{provider_id}", method="GET", level=LLAMA_STACK_API_V1)
async def inspect_provider(self, provider_id: str) -> ProviderInfo:
"""Get detailed information about a specific provider.
"""Get provider.
Get detailed information about a specific provider.
:param provider_id: The ID of the provider to inspect.
:returns: A ProviderInfo object containing the provider's details.

View file

@ -9,7 +9,7 @@ from typing import Any, Protocol, runtime_checkable
from pydantic import BaseModel, Field
from llama_stack.apis.inference import Message
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.shields import Shield
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -96,16 +96,23 @@ class ShieldStore(Protocol):
@runtime_checkable
@trace_protocol
class Safety(Protocol):
"""Safety
OpenAI-compatible Moderations API.
"""
shield_store: ShieldStore
@webmethod(route="/safety/run-shield", method="POST", level=LLAMA_STACK_API_V1)
async def run_shield(
self,
shield_id: str,
messages: list[Message],
messages: list[OpenAIMessageParam],
params: dict[str, Any],
) -> RunShieldResponse:
"""Run a shield.
"""Run shield.
Run a shield.
:param shield_id: The identifier of the shield to run.
:param messages: The messages to run the shield on.
@ -117,7 +124,9 @@ class Safety(Protocol):
@webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
"""Classifies if text and/or image inputs are potentially harmful.
"""Create moderation.
Classifies if text and/or image inputs are potentially harmful.
:param input: Input (or inputs) to classify.
Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
:param model: The content moderation model you would like to use.
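Since `/moderations` is OpenAI-compatible, the standard client can exercise it directly. A hedged sketch (base URL and moderation model id are assumptions):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")  # assumed local stack

result = client.moderations.create(
    model="llama-guard",              # hypothetical moderation/shield model id
    input="How do I pick a lock?",
)
print(result.results[0].flagged, result.results[0].categories)
```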

View file

@ -4,14 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Literal, Protocol, runtime_checkable
from typing import Literal
from pydantic import BaseModel
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
from llama_stack.schema_utils import json_schema_type
@json_schema_type
@ -61,57 +59,3 @@ class ListVectorDBsResponse(BaseModel):
"""
data: list[VectorDB]
@runtime_checkable
@trace_protocol
class VectorDBs(Protocol):
@webmethod(route="/vector-dbs", method="GET", level=LLAMA_STACK_API_V1)
async def list_vector_dbs(self) -> ListVectorDBsResponse:
"""List all vector databases.
:returns: A ListVectorDBsResponse.
"""
...
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET", level=LLAMA_STACK_API_V1)
async def get_vector_db(
self,
vector_db_id: str,
) -> VectorDB:
"""Get a vector database by its identifier.
:param vector_db_id: The identifier of the vector database to get.
:returns: A VectorDB.
"""
...
@webmethod(route="/vector-dbs", method="POST", level=LLAMA_STACK_API_V1)
async def register_vector_db(
self,
vector_db_id: str,
embedding_model: str,
embedding_dimension: int | None = 384,
provider_id: str | None = None,
vector_db_name: str | None = None,
provider_vector_db_id: str | None = None,
) -> VectorDB:
"""Register a vector database.
:param vector_db_id: The identifier of the vector database to register.
:param embedding_model: The embedding model to use.
:param embedding_dimension: The dimension of the embedding model.
:param provider_id: The identifier of the provider.
:param vector_db_name: The name of the vector database.
:param provider_vector_db_id: The identifier of the vector database in the provider.
:returns: A VectorDB.
"""
...
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
async def unregister_vector_db(self, vector_db_id: str) -> None:
"""Unregister a vector database.
:param vector_db_id: The identifier of the vector database to unregister.
"""
...

View file

@ -11,6 +11,7 @@
import uuid
from typing import Annotated, Any, Literal, Protocol, runtime_checkable
from fastapi import Body
from pydantic import BaseModel, Field
from llama_stack.apis.inference import InterleavedContent
@ -466,6 +467,40 @@ class VectorStoreFilesListInBatchResponse(BaseModel):
has_more: bool = False
# extra_body can be accessed via .model_extra
@json_schema_type
class OpenAICreateVectorStoreRequestWithExtraBody(BaseModel, extra="allow"):
"""Request to create a vector store with extra_body support.
:param name: (Optional) A name for the vector store
:param file_ids: List of file IDs to include in the vector store
:param expires_after: (Optional) Expiration policy for the vector store
:param chunking_strategy: (Optional) Strategy for splitting files into chunks
:param metadata: Set of key-value pairs that can be attached to the vector store
"""
name: str | None = None
file_ids: list[str] | None = None
expires_after: dict[str, Any] | None = None
chunking_strategy: dict[str, Any] | None = None
metadata: dict[str, Any] | None = None
# extra_body can be accessed via .model_extra
@json_schema_type
class OpenAICreateVectorStoreFileBatchRequestWithExtraBody(BaseModel, extra="allow"):
"""Request to create a vector store file batch with extra_body support.
:param file_ids: A list of File IDs that the vector store should use
:param attributes: (Optional) Key-value attributes to store with the files
:param chunking_strategy: (Optional) The chunking strategy used to chunk the file(s). Defaults to auto
"""
file_ids: list[str]
attributes: dict[str, Any] | None = None
chunking_strategy: VectorStoreChunkingStrategy | None = None
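The explicit `embedding_model`, `embedding_dimension`, and `provider_id` parameters removed from `openai_create_vector_store` below can still be supplied; they simply travel as undeclared body fields and surface in `params.model_extra`. A sketch using a raw POST against the `/v1/vector_stores` route (server address and embedding model id are assumptions; auth headers omitted):

```python
import httpx

BASE = "http://localhost:8321"  # assumed local server; auth omitted for brevity

body = {
    "name": "docs-store",
    "file_ids": [],
    "metadata": {"team": "demo"},
    # Llama-Stack-specific knobs, accepted thanks to extra="allow":
    "embedding_model": "nomic-embed-text",   # hypothetical embedding model id
    "embedding_dimension": 768,
}
resp = httpx.post(f"{BASE}/v1/vector_stores", json=body)
print(resp.status_code, resp.json())
```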
class VectorDBStore(Protocol):
def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...
@ -516,25 +551,11 @@ class VectorIO(Protocol):
@webmethod(route="/vector_stores", method="POST", level=LLAMA_STACK_API_V1)
async def openai_create_vector_store(
self,
name: str | None = None,
file_ids: list[str] | None = None,
expires_after: dict[str, Any] | None = None,
chunking_strategy: dict[str, Any] | None = None,
metadata: dict[str, Any] | None = None,
embedding_model: str | None = None,
embedding_dimension: int | None = 384,
provider_id: str | None = None,
params: Annotated[OpenAICreateVectorStoreRequestWithExtraBody, Body(...)],
) -> VectorStoreObject:
"""Creates a vector store.
:param name: A name for the vector store.
:param file_ids: A list of File IDs that the vector store should use. Useful for tools like `file_search` that can access files.
:param expires_after: The expiration policy for a vector store.
:param chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto` strategy.
:param metadata: Set of 16 key-value pairs that can be attached to an object.
:param embedding_model: The embedding model to use for this vector store.
:param embedding_dimension: The dimension of the embedding vectors (default: 384).
:param provider_id: The ID of the provider to use for this vector store.
Generate an OpenAI-compatible vector store with the given parameters.
:returns: A VectorStoreObject representing the created vector store.
"""
...
@ -827,16 +848,12 @@ class VectorIO(Protocol):
async def openai_create_vector_store_file_batch(
self,
vector_store_id: str,
file_ids: list[str],
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
params: Annotated[OpenAICreateVectorStoreFileBatchRequestWithExtraBody, Body(...)],
) -> VectorStoreFileBatchObject:
"""Create a vector store file batch.
Generate an OpenAI-compatible vector store file batch for the given vector store.
:param vector_store_id: The ID of the vector store to create the file batch for.
:param file_ids: A list of File IDs that the vector store should use.
:param attributes: (Optional) Key-value attributes to store with the files.
:param chunking_strategy: (Optional) The chunking strategy used to chunk the file(s). Defaults to auto.
:returns: A VectorStoreFileBatchObject representing the created file batch.
"""
...

View file

@ -1,495 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import asyncio
import json
import os
import shutil
import sys
from dataclasses import dataclass
from datetime import UTC, datetime
from functools import partial
from pathlib import Path
import httpx
from pydantic import BaseModel, ConfigDict
from rich.console import Console
from rich.progress import (
BarColumn,
DownloadColumn,
Progress,
TextColumn,
TimeRemainingColumn,
TransferSpeedColumn,
)
from termcolor import cprint
from llama_stack.cli.subcommand import Subcommand
from llama_stack.models.llama.sku_list import LlamaDownloadInfo
from llama_stack.models.llama.sku_types import Model
class Download(Subcommand):
"""Llama cli for downloading llama toolchain assets"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"download",
prog="llama download",
description="Download a model from llama.meta.com or Hugging Face Hub",
formatter_class=argparse.RawTextHelpFormatter,
)
setup_download_parser(self.parser)
def setup_download_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--source",
choices=["meta", "huggingface"],
default="meta",
)
parser.add_argument(
"--model-id",
required=False,
help="See `llama model list` or `llama model list --show-all` for the list of available models. Specify multiple model IDs with commas, e.g. --model-id Llama3.2-1B,Llama3.2-3B",
)
parser.add_argument(
"--hf-token",
type=str,
required=False,
default=None,
help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.",
)
parser.add_argument(
"--meta-url",
type=str,
required=False,
help="For source=meta, URL obtained from llama.meta.com after accepting license terms",
)
parser.add_argument(
"--max-parallel",
type=int,
required=False,
default=3,
help="Maximum number of concurrent downloads",
)
parser.add_argument(
"--ignore-patterns",
type=str,
required=False,
default="*.safetensors",
help="""For source=huggingface, files matching any of the patterns are not downloaded. Defaults to ignoring
safetensors files to avoid downloading duplicate weights.
""",
)
parser.add_argument(
"--manifest-file",
type=str,
help="For source=meta, you can download models from a manifest file containing a file => URL mapping",
required=False,
)
parser.set_defaults(func=partial(run_download_cmd, parser=parser))
@dataclass
class DownloadTask:
url: str
output_file: str
total_size: int = 0
downloaded_size: int = 0
task_id: int | None = None
retries: int = 0
max_retries: int = 3
class DownloadError(Exception):
pass
class CustomTransferSpeedColumn(TransferSpeedColumn):
def render(self, task):
if task.finished:
return "-"
return super().render(task)
class ParallelDownloader:
def __init__(
self,
max_concurrent_downloads: int = 3,
buffer_size: int = 1024 * 1024,
timeout: int = 30,
):
self.max_concurrent_downloads = max_concurrent_downloads
self.buffer_size = buffer_size
self.timeout = timeout
self.console = Console()
self.progress = Progress(
TextColumn("[bold blue]{task.description}"),
BarColumn(bar_width=40),
"[progress.percentage]{task.percentage:>3.1f}%",
DownloadColumn(),
CustomTransferSpeedColumn(),
TimeRemainingColumn(),
console=self.console,
expand=True,
)
self.client_options = {
"timeout": httpx.Timeout(timeout),
"follow_redirects": True,
}
async def retry_with_exponential_backoff(self, task: DownloadTask, func, *args, **kwargs):
last_exception = None
for attempt in range(task.max_retries):
try:
return await func(*args, **kwargs)
except Exception as e:
last_exception = e
if attempt < task.max_retries - 1:
wait_time = min(30, 2**attempt) # Cap at 30 seconds
self.console.print(
f"[yellow]Attempt {attempt + 1}/{task.max_retries} failed, "
f"retrying in {wait_time} seconds: {str(e)}[/yellow]"
)
await asyncio.sleep(wait_time)
continue
raise last_exception
async def get_file_info(self, client: httpx.AsyncClient, task: DownloadTask) -> None:
if task.total_size > 0:
self.progress.update(task.task_id, total=task.total_size)
return
async def _get_info():
response = await client.head(task.url, headers={"Accept-Encoding": "identity"}, **self.client_options)
response.raise_for_status()
return response
try:
response = await self.retry_with_exponential_backoff(task, _get_info)
task.url = str(response.url)
task.total_size = int(response.headers.get("Content-Length", 0))
if task.total_size == 0:
raise DownloadError(
f"Unable to determine file size for {task.output_file}. "
"The server might not support range requests."
)
# Update the progress bar's total size once we know it
if task.task_id is not None:
self.progress.update(task.task_id, total=task.total_size)
except httpx.HTTPError as e:
self.console.print(f"[red]Error getting file info: {str(e)}[/red]")
raise
def verify_file_integrity(self, task: DownloadTask) -> bool:
if not os.path.exists(task.output_file):
return False
return os.path.getsize(task.output_file) == task.total_size
async def download_chunk(self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int) -> None:
async def _download_chunk():
headers = {"Range": f"bytes={start}-{end}"}
async with client.stream("GET", task.url, headers=headers, **self.client_options) as response:
response.raise_for_status()
with open(task.output_file, "ab") as file:
file.seek(start)
async for chunk in response.aiter_bytes(self.buffer_size):
file.write(chunk)
task.downloaded_size += len(chunk)
self.progress.update(
task.task_id,
completed=task.downloaded_size,
)
try:
await self.retry_with_exponential_backoff(task, _download_chunk)
except Exception as e:
raise DownloadError(
f"Failed to download chunk {start}-{end} after {task.max_retries} attempts: {str(e)}"
) from e
async def prepare_download(self, task: DownloadTask) -> None:
output_dir = os.path.dirname(task.output_file)
os.makedirs(output_dir, exist_ok=True)
if os.path.exists(task.output_file):
task.downloaded_size = os.path.getsize(task.output_file)
async def download_file(self, task: DownloadTask) -> None:
try:
async with httpx.AsyncClient(**self.client_options) as client:
await self.get_file_info(client, task)
# Check if file is already downloaded
if os.path.exists(task.output_file):
if self.verify_file_integrity(task):
self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
self.progress.update(task.task_id, completed=task.total_size)
return
await self.prepare_download(task)
try:
# Split the remaining download into chunks
chunk_size = 27_000_000_000 # Cloudfront max chunk size
chunks = []
current_pos = task.downloaded_size
while current_pos < task.total_size:
chunk_end = min(current_pos + chunk_size - 1, task.total_size - 1)
chunks.append((current_pos, chunk_end))
current_pos = chunk_end + 1
# Download chunks in sequence
for chunk_start, chunk_end in chunks:
await self.download_chunk(client, task, chunk_start, chunk_end)
except Exception as e:
raise DownloadError(f"Download failed: {str(e)}") from e
except Exception as e:
self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e
def has_disk_space(self, tasks: list[DownloadTask]) -> bool:
try:
total_remaining_size = sum(task.total_size - task.downloaded_size for task in tasks)
dir_path = os.path.dirname(os.path.abspath(tasks[0].output_file))
free_space = shutil.disk_usage(dir_path).free
# Add 10% buffer for safety
required_space = int(total_remaining_size * 1.1)
if free_space < required_space:
self.console.print(
f"[red]Not enough disk space. Required: {required_space // (1024 * 1024)} MB, "
f"Available: {free_space // (1024 * 1024)} MB[/red]"
)
return False
return True
except Exception as e:
raise DownloadError(f"Failed to check disk space: {str(e)}") from e
async def download_all(self, tasks: list[DownloadTask]) -> None:
if not tasks:
raise ValueError("No download tasks provided")
if not os.environ.get("LLAMA_DOWNLOAD_NO_SPACE_CHECK") and not self.has_disk_space(tasks):
raise DownloadError("Insufficient disk space for downloads")
failed_tasks = []
with self.progress:
for task in tasks:
desc = f"Downloading {Path(task.output_file).name}"
task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
semaphore = asyncio.Semaphore(self.max_concurrent_downloads)
async def download_with_semaphore(task: DownloadTask):
async with semaphore:
try:
await self.download_file(task)
except Exception as e:
failed_tasks.append((task, str(e)))
await asyncio.gather(*(download_with_semaphore(task) for task in tasks))
if failed_tasks:
self.console.print("\n[red]Some downloads failed:[/red]")
for task, error in failed_tasks:
self.console.print(f"[red]- {Path(task.output_file).name}: {error}[/red]")
raise DownloadError(f"{len(failed_tasks)} downloads failed")
def _hf_download(
model: "Model",
hf_token: str,
ignore_patterns: str,
parser: argparse.ArgumentParser,
):
from huggingface_hub import snapshot_download
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from llama_stack.core.utils.model_utils import model_local_dir
repo_id = model.huggingface_repo
if repo_id is None:
raise ValueError(f"No repo id found for model {model.descriptor()}")
output_dir = model_local_dir(model.descriptor())
os.makedirs(output_dir, exist_ok=True)
try:
true_output_dir = snapshot_download(
repo_id,
local_dir=output_dir,
ignore_patterns=ignore_patterns,
token=hf_token,
library_name="llama-stack",
)
except GatedRepoError:
parser.error(
"It looks like you are trying to access a gated repository. Please ensure you "
"have access to the repository and have provided the proper Hugging Face API token "
"using the option `--hf-token` or by running `huggingface-cli login`."
"You can find your token by visiting https://huggingface.co/settings/tokens"
)
except RepositoryNotFoundError:
parser.error(f"Repository '{repo_id}' not found on the Hugging Face Hub or incorrect Hugging Face token.")
except Exception as e:
parser.error(e)
print(f"\nSuccessfully downloaded model to {true_output_dir}")
def _meta_download(
model: "Model",
model_id: str,
meta_url: str,
info: "LlamaDownloadInfo",
max_concurrent_downloads: int,
):
from llama_stack.core.utils.model_utils import model_local_dir
output_dir = Path(model_local_dir(model.descriptor()))
os.makedirs(output_dir, exist_ok=True)
# Create download tasks for each file
tasks = []
for f in info.files:
output_file = str(output_dir / f)
url = meta_url.replace("*", f"{info.folder}/{f}")
total_size = info.pth_size if "consolidated" in f else 0
tasks.append(DownloadTask(url=url, output_file=output_file, total_size=total_size, max_retries=3))
# Initialize and run parallel downloader
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
asyncio.run(downloader.download_all(tasks))
cprint(f"\nSuccessfully downloaded model to {output_dir}", color="green", file=sys.stderr)
cprint(
f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}",
file=sys.stderr,
)
cprint(
f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}",
color="yellow",
file=sys.stderr,
)
class ModelEntry(BaseModel):
model_id: str
files: dict[str, str]
model_config = ConfigDict(protected_namespaces=())
class Manifest(BaseModel):
models: list[ModelEntry]
expires_on: datetime
def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
from llama_stack.core.utils.model_utils import model_local_dir
with open(manifest_file) as f:
d = json.load(f)
manifest = Manifest(**d)
if datetime.now(UTC) > manifest.expires_on.astimezone(UTC):
raise ValueError(f"Manifest URLs have expired on {manifest.expires_on}")
console = Console()
for entry in manifest.models:
console.print(f"[blue]Downloading model {entry.model_id}...[/blue]")
output_dir = Path(model_local_dir(entry.model_id))
os.makedirs(output_dir, exist_ok=True)
if any(output_dir.iterdir()):
console.print(f"[yellow]Output directory {output_dir} is not empty.[/yellow]")
while True:
resp = input("Do you want to (C)ontinue download or (R)estart completely? (continue/restart): ")
if resp.lower() in ["restart", "r"]:
shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)
break
elif resp.lower() in ["continue", "c"]:
console.print("[blue]Continuing download...[/blue]")
break
else:
console.print("[red]Invalid response. Please try again.[/red]")
# Create download tasks for all files in the manifest
tasks = [
DownloadTask(url=url, output_file=str(output_dir / fname), max_retries=3)
for fname, url in entry.files.items()
]
# Initialize and run parallel downloader
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
asyncio.run(downloader.download_all(tasks))
def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
"""Main download command handler"""
try:
if args.manifest_file:
_download_from_manifest(args.manifest_file, args.max_parallel)
return
if args.model_id is None:
parser.error("Please provide a model id")
return
# Handle comma-separated model IDs
model_ids = [model_id.strip() for model_id in args.model_id.split(",")]
from llama_stack.models.llama.sku_list import llama_meta_net_info, resolve_model
from .model.safety_models import (
prompt_guard_download_info_map,
prompt_guard_model_sku_map,
)
prompt_guard_model_sku_map = prompt_guard_model_sku_map()
prompt_guard_download_info_map = prompt_guard_download_info_map()
for model_id in model_ids:
if model_id in prompt_guard_model_sku_map.keys():
model = prompt_guard_model_sku_map[model_id]
info = prompt_guard_download_info_map[model_id]
else:
model = resolve_model(model_id)
if model is None:
parser.error(f"Model {model_id} not found")
continue
info = llama_meta_net_info(model)
if args.source == "huggingface":
_hf_download(model, args.hf_token, args.ignore_patterns, parser)
else:
meta_url = args.meta_url or input(
f"Please provide the signed URL for model {model_id} you received via email "
f"after visiting https://www.llama.com/llama-downloads/ "
f"(e.g., https://llama3-1.llamameta.net/*?Policy...): "
)
if "llamameta.net" not in meta_url:
parser.error("Invalid Meta URL provided")
_meta_download(model, model_id, meta_url, info, args.max_parallel)
except Exception as e:
parser.error(f"Download failed: {str(e)}")

View file

@ -6,11 +6,8 @@
import argparse
from .download import Download
from .model import ModelParser
from .stack import StackParser
from .stack.utils import print_subcommand_description
from .verify_download import VerifyDownload
class LlamaCLIParser:
@ -30,10 +27,7 @@ class LlamaCLIParser:
subparsers = self.parser.add_subparsers(title="subcommands")
# Add sub-commands
ModelParser.create(subparsers)
StackParser.create(subparsers)
Download.create(subparsers)
VerifyDownload.create(subparsers)
print_subcommand_description(self.parser, subparsers)

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .model import ModelParser # noqa

View file

@ -1,70 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import json
from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
from llama_stack.models.llama.sku_list import resolve_model
class ModelDescribe(Subcommand):
"""Show details about a model"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"describe",
prog="llama model describe",
description="Show details about a llama model",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_model_describe_cmd)
def _add_arguments(self):
self.parser.add_argument(
"-m",
"--model-id",
type=str,
required=True,
help="See `llama model list` or `llama model list --show-all` for the list of available models",
)
def _run_model_describe_cmd(self, args: argparse.Namespace) -> None:
from .safety_models import prompt_guard_model_sku_map
prompt_guard_model_map = prompt_guard_model_sku_map()
if args.model_id in prompt_guard_model_map.keys():
model = prompt_guard_model_map[args.model_id]
else:
model = resolve_model(args.model_id)
if model is None:
self.parser.error(
f"Model {args.model_id} not found; try 'llama model list' for a list of available models."
)
return
headers = [
"Model",
model.descriptor(),
]
rows = [
("Hugging Face ID", model.huggingface_repo or "<Not Available>"),
("Description", model.description),
("Context Length", f"{model.max_seq_length // 1024}K tokens"),
("Weights format", model.quantization_format.value),
("Model params.json", json.dumps(model.arch_args, indent=4)),
]
print_table(
rows,
headers,
separate_rows=True,
)

View file

@ -1,24 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.subcommand import Subcommand
class ModelDownload(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"download",
prog="llama model download",
description="Download a model from llama.meta.com or Hugging Face Hub",
formatter_class=argparse.RawTextHelpFormatter,
)
from llama_stack.cli.download import setup_download_parser
setup_download_parser(self.parser)

View file

@ -1,119 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import os
import time
from pathlib import Path
from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
from llama_stack.models.llama.sku_list import all_registered_models
def _get_model_size(model_dir):
return sum(f.stat().st_size for f in Path(model_dir).rglob("*") if f.is_file())
def _convert_to_model_descriptor(model):
for m in all_registered_models():
if model == m.descriptor().replace(":", "-"):
return str(m.descriptor())
return str(model)
def _run_model_list_downloaded_cmd() -> None:
headers = ["Model", "Size", "Modified Time"]
rows = []
for model in os.listdir(DEFAULT_CHECKPOINT_DIR):
abs_path = os.path.join(DEFAULT_CHECKPOINT_DIR, model)
space_usage = _get_model_size(abs_path)
model_size = f"{space_usage / (1024**3):.2f} GB"
modified_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(os.path.getmtime(abs_path)))
rows.append(
[
_convert_to_model_descriptor(model),
model_size,
modified_time,
]
)
print_table(
rows,
headers,
separate_rows=True,
)
class ModelList(Subcommand):
"""List available llama models"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"list",
prog="llama model list",
description="Show available llama models",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_model_list_cmd)
def _add_arguments(self):
self.parser.add_argument(
"--show-all",
action="store_true",
help="Show all models (not just defaults)",
)
self.parser.add_argument(
"--downloaded",
action="store_true",
help="List the downloaded models",
)
self.parser.add_argument(
"-s",
"--search",
type=str,
required=False,
help="Search for the input string as a substring in the model descriptor(ID)",
)
def _run_model_list_cmd(self, args: argparse.Namespace) -> None:
from .safety_models import prompt_guard_model_skus
if args.downloaded:
return _run_model_list_downloaded_cmd()
headers = [
"Model Descriptor(ID)",
"Hugging Face Repo",
"Context Length",
]
rows = []
for model in all_registered_models() + prompt_guard_model_skus():
if not args.show_all and not model.is_featured:
continue
descriptor = model.descriptor()
if not args.search or args.search.lower() in descriptor.lower():
rows.append(
[
descriptor,
model.huggingface_repo,
f"{model.max_seq_length // 1024}K",
]
)
if len(rows) == 0:
print(f"Did not find any model matching `{args.search}`.")
else:
print_table(
rows,
headers,
separate_rows=True,
)

View file

@ -1,43 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.model.describe import ModelDescribe
from llama_stack.cli.model.download import ModelDownload
from llama_stack.cli.model.list import ModelList
from llama_stack.cli.model.prompt_format import ModelPromptFormat
from llama_stack.cli.model.remove import ModelRemove
from llama_stack.cli.model.verify_download import ModelVerifyDownload
from llama_stack.cli.stack.utils import print_subcommand_description
from llama_stack.cli.subcommand import Subcommand
class ModelParser(Subcommand):
"""Llama cli for model interface apis"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"model",
prog="llama model",
description="Work with llama models",
formatter_class=argparse.RawTextHelpFormatter,
)
self.parser.set_defaults(func=lambda args: self.parser.print_help())
subparsers = self.parser.add_subparsers(title="model_subcommands")
# Add sub-commands
ModelDownload.create(subparsers)
ModelList.create(subparsers)
ModelPromptFormat.create(subparsers)
ModelDescribe.create(subparsers)
ModelVerifyDownload.create(subparsers)
ModelRemove.create(subparsers)
print_subcommand_description(self.parser, subparsers)

View file

@ -1,133 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import textwrap
from io import StringIO
from pathlib import Path
from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
from llama_stack.models.llama.sku_types import CoreModelId, ModelFamily, is_multimodal, model_family
ROOT_DIR = Path(__file__).parent.parent.parent
class ModelPromptFormat(Subcommand):
"""Llama model cli for describe a model prompt format (message formats)"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"prompt-format",
prog="llama model prompt-format",
description="Show llama model message formats",
epilog=textwrap.dedent(
"""
Example:
llama model prompt-format <options>
"""
),
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_model_template_cmd)
def _add_arguments(self):
self.parser.add_argument(
"-m",
"--model-name",
type=str,
help="Example: Llama3.1-8B or Llama3.2-11B-Vision, etc\n"
"(Run `llama model list` to see a list of valid model names)",
)
self.parser.add_argument(
"-l",
"--list",
action="store_true",
help="List all available models",
)
def _run_model_template_cmd(self, args: argparse.Namespace) -> None:
import importlib.resources
# Only Llama 3.1 and 3.2 are supported
supported_model_ids = [
m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
]
model_list = [m.value for m in supported_model_ids]
if args.list:
headers = ["Model(s)"]
rows = []
for m in model_list:
rows.append(
[
m,
]
)
print_table(
rows,
headers,
separate_rows=True,
)
return
try:
model_id = CoreModelId(args.model_name)
except ValueError:
self.parser.error(
f"{args.model_name} is not a valid Model. Choose one from the list of valid models. "
f"Run `llama model list` to see the valid model names."
)
if model_id not in supported_model_ids:
self.parser.error(
f"{model_id} is not a valid Model. Choose one from the list of valid models. "
f"Run `llama model list` to see the valid model names."
)
llama_3_1_file = ROOT_DIR / "models" / "llama" / "llama3_1" / "prompt_format.md"
llama_3_2_text_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "text_prompt_format.md"
llama_3_2_vision_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "vision_prompt_format.md"
if model_family(model_id) == ModelFamily.llama3_1:
with importlib.resources.as_file(llama_3_1_file) as f:
content = f.open("r").read()
elif model_family(model_id) == ModelFamily.llama3_2:
if is_multimodal(model_id):
with importlib.resources.as_file(llama_3_2_vision_file) as f:
content = f.open("r").read()
else:
with importlib.resources.as_file(llama_3_2_text_file) as f:
content = f.open("r").read()
render_markdown_to_pager(content)
def render_markdown_to_pager(markdown_content: str):
from rich.console import Console
from rich.markdown import Markdown
from rich.style import Style
from rich.text import Text
class LeftAlignedHeaderMarkdown(Markdown):
def parse_header(self, token):
level = token.type.count("h")
content = Text(token.content)
header_style = Style(color="bright_blue", bold=True)
header = Text(f"{'#' * level} ", style=header_style) + content
self.add_text(header)
# Render the Markdown
md = LeftAlignedHeaderMarkdown(markdown_content)
# Capture the rendered output
output = StringIO()
console = Console(file=output, force_terminal=True, width=100) # Set a fixed width
console.print(md)
rendered_content = output.getvalue()
print(rendered_content)

View file

@ -1,68 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import os
import shutil
from llama_stack.cli.subcommand import Subcommand
from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
from llama_stack.models.llama.sku_list import resolve_model
class ModelRemove(Subcommand):
"""Remove the downloaded llama model"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"remove",
prog="llama model remove",
description="Remove the downloaded llama model",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_model_remove_cmd)
def _add_arguments(self):
self.parser.add_argument(
"-m",
"--model",
required=True,
help="Specify the llama downloaded model name, see `llama model list --downloaded`",
)
self.parser.add_argument(
"-f",
"--force",
action="store_true",
help="Used to forcefully remove the llama model from the storage without further confirmation",
)
def _run_model_remove_cmd(self, args: argparse.Namespace) -> None:
from .safety_models import prompt_guard_model_sku_map
prompt_guard_model_map = prompt_guard_model_sku_map()
if args.model in prompt_guard_model_map.keys():
model = prompt_guard_model_map[args.model]
else:
model = resolve_model(args.model)
model_path = os.path.join(DEFAULT_CHECKPOINT_DIR, args.model.replace(":", "-"))
if model is None or not os.path.isdir(model_path):
print(f"'{args.model}' is not a valid llama model or does not exist.")
return
if args.force:
shutil.rmtree(model_path)
print(f"{args.model} removed.")
else:
if input(f"Are you sure you want to remove {args.model}? (y/n): ").strip().lower() == "y":
shutil.rmtree(model_path)
print(f"{args.model} removed.")
else:
print("Removal aborted.")

View file

@ -1,64 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from pydantic import BaseModel, ConfigDict, Field
from llama_stack.models.llama.sku_list import LlamaDownloadInfo
from llama_stack.models.llama.sku_types import CheckpointQuantizationFormat
class PromptGuardModel(BaseModel):
"""Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed."""
model_id: str
huggingface_repo: str
description: str = "Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
is_featured: bool = False
max_seq_length: int = 512
is_instruct_model: bool = False
quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
arch_args: dict[str, Any] = Field(default_factory=dict)
def descriptor(self) -> str:
return self.model_id
model_config = ConfigDict(protected_namespaces=())
def prompt_guard_model_skus():
return [
PromptGuardModel(model_id="Prompt-Guard-86M", huggingface_repo="meta-llama/Prompt-Guard-86M"),
PromptGuardModel(
model_id="Llama-Prompt-Guard-2-86M",
huggingface_repo="meta-llama/Llama-Prompt-Guard-2-86M",
),
PromptGuardModel(
model_id="Llama-Prompt-Guard-2-22M",
huggingface_repo="meta-llama/Llama-Prompt-Guard-2-22M",
),
]
def prompt_guard_model_sku_map() -> dict[str, Any]:
return {model.model_id: model for model in prompt_guard_model_skus()}
def prompt_guard_download_info_map() -> dict[str, LlamaDownloadInfo]:
return {
model.model_id: LlamaDownloadInfo(
folder="Prompt-Guard" if model.model_id == "Prompt-Guard-86M" else model.model_id,
files=[
"model.safetensors",
"special_tokens_map.json",
"tokenizer.json",
"tokenizer_config.json",
],
pth_size=1,
)
for model in prompt_guard_model_skus()
}

View file

@ -1,24 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.subcommand import Subcommand
class ModelVerifyDownload(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"verify-download",
prog="llama model verify-download",
description="Verify the downloaded checkpoints' checksums for models downloaded from Meta",
formatter_class=argparse.RawTextHelpFormatter,
)
from llama_stack.cli.verify_download import setup_verify_download_parser
setup_verify_download_parser(self.parser)

View file

@ -439,12 +439,24 @@ def _run_stack_build_command_from_build_config(
cprint("Build Successful!", color="green", file=sys.stderr)
cprint(f"You can find the newly-built distribution here: {run_config_file}", color="blue", file=sys.stderr)
cprint(
"You can run the new Llama Stack distro via: "
+ colored(f"llama stack run {run_config_file} --image-type {build_config.image_type}", "blue"),
color="green",
file=sys.stderr,
)
if build_config.image_type == LlamaStackImageType.VENV:
cprint(
"You can run the new Llama Stack distro (after activating "
+ colored(image_name, "cyan")
+ ") via: "
+ colored(f"llama stack run {run_config_file}", "blue"),
color="green",
file=sys.stderr,
)
elif build_config.image_type == LlamaStackImageType.CONTAINER:
cprint(
"You can run the container with: "
+ colored(
f"docker run -p 8321:8321 -v ~/.llama:/root/.llama localhost/{image_name} --port 8321", "blue"
),
color="green",
file=sys.stderr,
)
return distro_path
else:
return _generate_run_config(build_config, build_dir, image_name)

View file

@ -6,11 +6,18 @@
import argparse
import os
import ssl
import subprocess
from pathlib import Path
import uvicorn
import yaml
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.subcommand import Subcommand
from llama_stack.core.datatypes import LoggingConfig, StackRunConfig
from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
from llama_stack.log import get_logger
REPO_ROOT = Path(__file__).parent.parent.parent.parent
@ -48,18 +55,12 @@ class StackRun(Subcommand):
"--image-name",
type=str,
default=None,
help="Name of the image to run. Defaults to the current environment",
)
self.parser.add_argument(
"--env",
action="append",
help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
metavar="KEY=VALUE",
help="[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running.",
)
self.parser.add_argument(
"--image-type",
type=str,
help="Image Type used during the build. This can be only venv.",
help="[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running.",
choices=[e.value for e in ImageType if e.value != ImageType.CONTAINER.value],
)
self.parser.add_argument(
@ -68,48 +69,22 @@ class StackRun(Subcommand):
help="Start the UI server",
)
def _resolve_config_and_distro(self, args: argparse.Namespace) -> tuple[Path | None, str | None]:
"""Resolve config file path and distribution name from args.config"""
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
if not args.config:
return None, None
config_file = Path(args.config)
has_yaml_suffix = args.config.endswith(".yaml")
distro_name = None
if not config_file.exists() and not has_yaml_suffix:
# check if this is a distribution
config_file = Path(REPO_ROOT) / "llama_stack" / "distributions" / args.config / "run.yaml"
if config_file.exists():
distro_name = args.config
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir
config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
if not config_file.exists():
self.parser.error(
f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
)
if not config_file.is_file():
self.parser.error(
f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
)
return config_file, distro_name
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import yaml
from llama_stack.core.configure import parse_and_maybe_upgrade_config
from llama_stack.core.utils.exec import formulate_run_args, run_command
if args.image_type or args.image_name:
self.parser.error(
"The --image-type and --image-name flags are no longer supported.\n\n"
"Please activate your virtual environment manually before running `llama stack run`.\n\n"
"For example:\n"
" source /path/to/venv/bin/activate\n"
" llama stack run <config>\n"
)
if args.enable_ui:
self._start_ui_development_server(args.port)
image_type, image_name = args.image_type, args.image_name
if args.config:
try:
@ -121,10 +96,6 @@ class StackRun(Subcommand):
else:
config_file = None
# Check if config is required based on image type
if image_type == ImageType.VENV.value and not config_file:
self.parser.error("Config file is required for venv environment")
if config_file:
logger.info(f"Using run configuration: {config_file}")
@ -139,50 +110,67 @@ class StackRun(Subcommand):
os.makedirs(str(config.external_providers_dir), exist_ok=True)
except AttributeError as e:
self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
self._uvicorn_run(config_file, args)
def _uvicorn_run(self, config_file: Path | None, args: argparse.Namespace) -> None:
if not config_file:
self.parser.error("Config file is required")
config_file = resolve_config_or_distro(str(config_file), Mode.RUN)
with open(config_file) as fp:
config_contents = yaml.safe_load(fp)
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
logger_config = LoggingConfig(**cfg)
else:
logger_config = None
config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
port = args.port or config.server.port
host = config.server.host or ["::", "0.0.0.0"]
# Set the config file in environment so create_app can find it
os.environ["LLAMA_STACK_CONFIG"] = str(config_file)
uvicorn_config = {
"factory": True,
"host": host,
"port": port,
"lifespan": "on",
"log_level": logger.getEffectiveLevel(),
"log_config": logger_config,
}
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
uvicorn_config["ssl_keyfile"] = config.server.tls_keyfile
uvicorn_config["ssl_certfile"] = config.server.tls_certfile
if config.server.tls_cafile:
uvicorn_config["ssl_ca_certs"] = config.server.tls_cafile
uvicorn_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
logger.info(
f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
)
else:
config = None
logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
# If neither image type nor image name is provided, assume the server should be run directly
# using the current environment packages.
if not image_type and not image_name:
logger.info("No image type or image name provided. Assuming environment packages.")
from llama_stack.core.server.server import main as server_main
logger.info(f"Listening on {host}:{port}")
# Build the server args from the current args passed to the CLI
server_args = argparse.Namespace()
for arg in vars(args):
# If this is a function, avoid passing it
# "args" contains:
# func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
if callable(getattr(args, arg)):
continue
if arg == "config":
server_args.config = str(config_file)
else:
setattr(server_args, arg, getattr(args, arg))
# Run the server
server_main(server_args)
else:
run_args = formulate_run_args(image_type, image_name)
run_args.extend([str(args.port)])
if config_file:
run_args.extend(["--config", str(config_file)])
if args.env:
for env_var in args.env:
if "=" not in env_var:
self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format")
return
key, value = env_var.split("=", 1) # split on first = only
if not key:
self.parser.error(f"Environment variable '{env_var}' has empty key")
return
run_args.extend(["--env", f"{key}={value}"])
run_command(run_args)
# We need to catch KeyboardInterrupt because uvicorn's signal handling
# re-raises SIGINT signals using signal.raise_signal(), which Python
# converts to KeyboardInterrupt. Without this catch, we'd get a confusing
# stack trace when using Ctrl+C or kill -2 (SIGINT).
# SIGTERM (kill -15) works fine without this because Python doesn't
# have a default handler for it.
#
# Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
# signal handling but this is quite intrusive and not worth the effort.
try:
uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)
except (KeyboardInterrupt, SystemExit):
logger.info("Received interrupt signal, shutting down gracefully...")
def _start_ui_development_server(self, stack_server_port: int):
logger.info("Attempting to start UI development server...")

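The try/except around uvicorn.run above is easier to see in isolation. A minimal, self-contained sketch of the same shutdown pattern follows; the trivial ASGI app and "demo_app" factory are stand-ins for illustration only, not the real llama_stack server module.

import uvicorn

async def app(scope, receive, send):
    # trivial ASGI app used only to demonstrate the shutdown pattern
    if scope["type"] == "http":
        await send({"type": "http.response.start", "status": 200, "headers": []})
        await send({"type": "http.response.body", "body": b"ok"})

def demo_app():
    # factory form, matching the factory=True style used above
    return app

if __name__ == "__main__":
    try:
        # uvicorn re-raises SIGINT via signal.raise_signal(), which surfaces here as KeyboardInterrupt
        uvicorn.run("__main__:demo_app", factory=True, host="0.0.0.0", port=8321)
    except (KeyboardInterrupt, SystemExit):
        print("Received interrupt signal, shutting down gracefully...")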
View file

@ -1,141 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import hashlib
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
from llama_stack.cli.subcommand import Subcommand
@dataclass
class VerificationResult:
filename: str
expected_hash: str
actual_hash: str | None
exists: bool
matches: bool
class VerifyDownload(Subcommand):
"""Llama cli for verifying downloaded model files"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"verify-download",
prog="llama verify-download",
description="Verify integrity of downloaded model files",
formatter_class=argparse.RawTextHelpFormatter,
)
setup_verify_download_parser(self.parser)
def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--model-id",
required=True,
help="Model ID to verify (only for models downloaded from Meta)",
)
parser.set_defaults(func=partial(run_verify_cmd, parser=parser))
def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str:
sha256_hash = hashlib.sha256()
with open(filepath, "rb") as f:
for chunk in iter(lambda: f.read(chunk_size), b""):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
def load_checksums(checklist_path: Path) -> dict[str, str]:
checksums = {}
with open(checklist_path) as f:
for line in f:
if line.strip():
sha256sum, filepath = line.strip().split(" ", 1)
# Remove leading './' if present
filepath = filepath.lstrip("./")
checksums[filepath] = sha256sum
return checksums
def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -> list[VerificationResult]:
results = []
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console,
) as progress:
for filepath, expected_hash in checksums.items():
full_path = model_dir / filepath
task_id = progress.add_task(f"Verifying {filepath}...", total=None)
exists = full_path.exists()
actual_hash = None
matches = False
if exists:
actual_hash = calculate_sha256(full_path)
matches = actual_hash == expected_hash
results.append(
VerificationResult(
filename=filepath,
expected_hash=expected_hash,
actual_hash=actual_hash,
exists=exists,
matches=matches,
)
)
progress.remove_task(task_id)
return results
def run_verify_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
from llama_stack.core.utils.model_utils import model_local_dir
console = Console()
model_dir = Path(model_local_dir(args.model_id))
checklist_path = model_dir / "checklist.chk"
if not model_dir.exists():
parser.error(f"Model directory not found: {model_dir}")
if not checklist_path.exists():
parser.error(f"Checklist file not found: {checklist_path}")
checksums = load_checksums(checklist_path)
results = verify_files(model_dir, checksums, console)
# Print results
console.print("\nVerification Results:")
all_good = True
for result in results:
if not result.exists:
console.print(f"[red]❌ {result.filename}: File not found[/red]")
all_good = False
elif not result.matches:
console.print(
f"[red]❌ {result.filename}: Hash mismatch[/red]\n"
f" Expected: {result.expected_hash}\n"
f" Got: {result.actual_hash}"
)
all_good = False
else:
console.print(f"[green]✓ {result.filename}: Verified[/green]")
if all_good:
console.print("\n[green]All files verified successfully![/green]")

View file

@ -324,14 +324,14 @@ fi
RUN pip uninstall -y uv
EOF
# If a run config is provided, we use the --config flag
# If a run config is provided, we use the llama stack CLI
if [[ -n "$run_config" ]]; then
add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$RUN_CONFIG_PATH"]
ENTRYPOINT ["llama", "stack", "run", "$RUN_CONFIG_PATH"]
EOF
elif [[ "$distro_or_config" != *.yaml ]]; then
add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$distro_or_config"]
ENTRYPOINT ["llama", "stack", "run", "$distro_or_config"]
EOF
fi

View file

@ -32,7 +32,7 @@ from llama_stack.providers.utils.sqlstore.sqlstore import (
sqlstore_impl,
)
logger = get_logger(name=__name__, category="openai::conversations")
logger = get_logger(name=__name__, category="openai_conversations")
class ConversationServiceConfig(BaseModel):
@ -196,12 +196,15 @@ class ConversationServiceImpl(Conversations):
await self._get_validated_conversation(conversation_id)
created_items = []
created_at = int(time.time())
base_time = int(time.time())
for item in items:
for i, item in enumerate(items):
item_dict = item.model_dump()
item_id = self._get_or_generate_item_id(item, item_dict)
# make each timestamp unique to maintain order
created_at = base_time + i
item_record = {
"id": item_id,
"conversation_id": conversation_id,

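A small illustration of the base_time + i trick above, using toy records rather than the real item schema: because each created_at is strictly increasing, insertion order survives a later sort on the timestamp column.

import time

base_time = int(time.time())
items = ["first message", "tool call", "assistant reply"]

# each record gets a unique, strictly increasing created_at
records = [{"item": item, "created_at": base_time + i} for i, item in enumerate(items)]

# sorting by created_at reproduces the original insertion order
assert [r["item"] for r in sorted(records, key=lambda r: r["created_at"])] == items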
View file

@ -47,10 +47,6 @@ def builtin_automatically_routed_apis() -> list[AutoRoutedApiInfo]:
routing_table_api=Api.shields,
router_api=Api.safety,
),
AutoRoutedApiInfo(
routing_table_api=Api.vector_dbs,
router_api=Api.vector_io,
),
AutoRoutedApiInfo(
routing_table_api=Api.datasets,
router_api=Api.datasetio,
@ -243,6 +239,7 @@ def get_external_providers_from_module(
spec = module.get_provider_spec()
else:
# pass in a partially filled out provider spec to satisfy the registry -- knowing we will be overwriting it later upon build and run
# in the case we are building we CANNOT import this module of course because it has not been installed.
spec = ProviderSpec(
api=Api(provider_api),
provider_type=provider.provider_type,
@ -251,9 +248,20 @@ def get_external_providers_from_module(
config_class="",
)
provider_type = provider.provider_type
# in the case we are building we CANNOT import this module of course because it has not been installed.
# return a partially filled out spec that the build script will populate.
registry[Api(provider_api)][provider_type] = spec
if isinstance(spec, list):
# optionally allow people to pass inline and remote provider specs as a returned list.
# with the old method, users could pass in directories of specs using overlapping code
# we want to ensure we preserve that flexibility in this method.
logger.info(
f"Detected a list of external provider specs from {provider.module} adding all to the registry"
)
for provider_spec in spec:
if provider_spec.provider_type != provider.provider_type:
continue
logger.info(f"Adding {provider.provider_type} to registry")
registry[Api(provider_api)][provider.provider_type] = provider_spec
else:
registry[Api(provider_api)][provider_type] = spec
except ModuleNotFoundError as exc:
raise ValueError(
"get_provider_spec not found. If specifying an external provider via `module` in the Provider spec, the Provider must have the `provider.get_provider_spec` module available"

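A self-contained sketch of the list-handling branch added above. Here Spec is a simplified stand-in for ProviderSpec and the type names are illustrative; the point is that get_provider_spec() may now return either a single spec or a list, and from a list only entries whose provider_type matches the configured provider are registered.

from dataclasses import dataclass

@dataclass
class Spec:
    api: str
    provider_type: str

def register(registry: dict, api: str, configured_type: str, result) -> None:
    if isinstance(result, list):
        # keep only the specs that match the provider being configured
        for s in result:
            if s.provider_type == configured_type:
                registry.setdefault(api, {})[s.provider_type] = s
    else:
        registry.setdefault(api, {})[configured_type] = result

registry: dict = {}
register(
    registry,
    "inference",
    "remote::acme",
    [Spec("inference", "remote::acme"), Spec("inference", "inline::other")],
)
assert list(registry["inference"]) == ["remote::acme"]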
View file

@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from collections.abc import Callable
IdFactory = Callable[[], str]
IdOverride = Callable[[str, IdFactory], str]
_id_override: IdOverride | None = None
def generate_object_id(kind: str, factory: IdFactory) -> str:
"""Generate an identifier for the given kind using the provided factory.
Allows tests to override ID generation deterministically by installing an
override callback via :func:`set_id_override`.
"""
override = _id_override
if override is not None:
return override(kind, factory)
return factory()
def set_id_override(override: IdOverride) -> IdOverride | None:
"""Install an override used to generate deterministic identifiers."""
global _id_override
previous = _id_override
_id_override = override
return previous
def reset_id_override(previous: IdOverride | None) -> None:
"""Restore the previous override returned by :func:`set_id_override`."""
global _id_override
_id_override = previous
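Illustrative test usage of the hooks defined above. This assumes generate_object_id, set_id_override, and reset_id_override are imported from the new module (its path is not shown in this diff), and the override string is made up for the example.

import uuid

# install a deterministic override and remember whatever was there before
previous = set_id_override(lambda kind, factory: f"{kind}_deterministic_0001")
try:
    conv_id = generate_object_id("conv", lambda: f"conv_{uuid.uuid4().hex}")
    assert conv_id == "conv_deterministic_0001"
finally:
    # restore the previous override (or None) so other tests are unaffected
    reset_id_override(previous)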

View file

@ -54,6 +54,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
setup_logger,
start_trace,
)
from llama_stack.strong_typing.inspection import is_unwrapped_body_param
logger = get_logger(name=__name__, category="core")
@ -383,7 +384,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
body, field_names = self._handle_file_uploads(options, body)
body = self._convert_body(path, options.method, body, exclude_params=set(field_names))
body = self._convert_body(matched_func, body, exclude_params=set(field_names))
trace_path = webmethod.descriptive_name or route_path
await start_trace(trace_path, {"__location__": "library_client"})
@ -446,7 +447,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
body |= path_params
body = self._convert_body(path, options.method, body)
# Prepare body for the function call (handles both Pydantic and traditional params)
body = self._convert_body(func, body)
trace_path = webmethod.descriptive_name or route_path
await start_trace(trace_path, {"__location__": "library_client"})
@ -493,21 +495,32 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
)
return await response.parse()
def _convert_body(
self, path: str, method: str, body: dict | None = None, exclude_params: set[str] | None = None
) -> dict:
def _convert_body(self, func: Any, body: dict | None = None, exclude_params: set[str] | None = None) -> dict:
if not body:
return {}
assert self.route_impls is not None # Should be guaranteed by request() method, assertion for mypy
exclude_params = exclude_params or set()
func, _, _, _ = find_matching_route(method, path, self.route_impls)
sig = inspect.signature(func)
params_list = [p for p in sig.parameters.values() if p.name != "self"]
# Flatten if there's a single unwrapped body parameter (BaseModel or Annotated[BaseModel, Body(embed=False)])
if len(params_list) == 1:
param = params_list[0]
param_type = param.annotation
if is_unwrapped_body_param(param_type):
base_type = get_args(param_type)[0]
return {param.name: base_type(**body)}
# Strip NOT_GIVENs to use the defaults in signature
body = {k: v for k, v in body.items() if v is not NOT_GIVEN}
# Check if there's an unwrapped body parameter among multiple parameters
# (e.g., path param + body param like: vector_store_id: str, params: Annotated[Model, Body(...)])
unwrapped_body_param = None
for param in params_list:
if is_unwrapped_body_param(param.annotation):
unwrapped_body_param = param
break
# Convert parameters to Pydantic models where needed
converted_body = {}
for param_name, param in sig.parameters.items():
@ -517,5 +530,11 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
converted_body[param_name] = value
else:
converted_body[param_name] = convert_to_pydantic(param.annotation, value)
elif unwrapped_body_param and param.name == unwrapped_body_param.name:
# This is the unwrapped body param - construct it from remaining body keys
base_type = get_args(param.annotation)[0]
# Extract only the keys that aren't already used by other params
remaining_keys = {k: v for k, v in body.items() if k not in converted_body}
converted_body[param.name] = base_type(**remaining_keys)
return converted_body
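To make the two shapes handled by _convert_body concrete, here are hypothetical endpoint signatures of the kind the comments above describe; the model, function, and parameter names are invented for illustration and are not part of the actual API.

from typing import Annotated
from fastapi import Body
from pydantic import BaseModel

class SearchRequest(BaseModel):  # hypothetical request model
    query: str
    limit: int = 10

# Case 1: a single unwrapped body parameter -- the whole JSON body constructs
# SearchRequest, so {"query": "x"} becomes SearchRequest(query="x").
async def search(params: Annotated[SearchRequest, Body(embed=False)]) -> None: ...

# Case 2: a path parameter plus an unwrapped body parameter -- vector_store_id is
# filled from the path, and the remaining body keys construct SearchRequest.
async def search_store(
    vector_store_id: str,
    params: Annotated[SearchRequest, Body(embed=False)],
) -> None: ...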

View file

@ -28,7 +28,6 @@ from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields
from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDBs
from llama_stack.apis.vector_io import VectorIO
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.core.client import get_client_impl
@ -55,7 +54,6 @@ from llama_stack.providers.datatypes import (
ScoringFunctionsProtocolPrivate,
ShieldsProtocolPrivate,
ToolGroupsProtocolPrivate,
VectorDBsProtocolPrivate,
)
logger = get_logger(name=__name__, category="core")
@ -81,7 +79,6 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
Api.inspect: Inspect,
Api.batches: Batches,
Api.vector_io: VectorIO,
Api.vector_dbs: VectorDBs,
Api.models: Models,
Api.safety: Safety,
Api.shields: Shields,
@ -125,7 +122,6 @@ def additional_protocols_map() -> dict[Api, Any]:
return {
Api.inference: (ModelsProtocolPrivate, Models, Api.models),
Api.tool_groups: (ToolGroupsProtocolPrivate, ToolGroups, Api.tool_groups),
Api.vector_io: (VectorDBsProtocolPrivate, VectorDBs, Api.vector_dbs),
Api.safety: (ShieldsProtocolPrivate, Shields, Api.shields),
Api.datasetio: (DatasetsProtocolPrivate, Datasets, Api.datasets),
Api.scoring: (
@ -150,6 +146,7 @@ async def resolve_impls(
provider_registry: ProviderRegistry,
dist_registry: DistributionRegistry,
policy: list[AccessRule],
internal_impls: dict[Api, Any] | None = None,
) -> dict[Api, Any]:
"""
Resolves provider implementations by:
@ -172,7 +169,7 @@ async def resolve_impls(
sorted_providers = sort_providers_by_deps(providers_with_specs, run_config)
return await instantiate_providers(sorted_providers, router_apis, dist_registry, run_config, policy)
return await instantiate_providers(sorted_providers, router_apis, dist_registry, run_config, policy, internal_impls)
def specs_for_autorouted_apis(apis_to_serve: list[str] | set[str]) -> dict[str, dict[str, ProviderWithSpec]]:
@ -280,9 +277,10 @@ async def instantiate_providers(
dist_registry: DistributionRegistry,
run_config: StackRunConfig,
policy: list[AccessRule],
internal_impls: dict[Api, Any] | None = None,
) -> dict[Api, Any]:
"""Instantiates providers asynchronously while managing dependencies."""
impls: dict[Api, Any] = {}
impls: dict[Api, Any] = internal_impls.copy() if internal_impls else {}
inner_impls_by_provider_id: dict[str, dict[str, Any]] = {f"inner-{x.value}": {} for x in router_apis}
for api_str, provider in sorted_providers:
# Skip providers that are not enabled

View file

@ -31,10 +31,8 @@ async def get_routing_table_impl(
from ..routing_tables.scoring_functions import ScoringFunctionsRoutingTable
from ..routing_tables.shields import ShieldsRoutingTable
from ..routing_tables.toolgroups import ToolGroupsRoutingTable
from ..routing_tables.vector_dbs import VectorDBsRoutingTable
api_to_tables = {
"vector_dbs": VectorDBsRoutingTable,
"models": ModelsRoutingTable,
"shields": ShieldsRoutingTable,
"datasets": DatasetsRoutingTable,

Some files were not shown because too many files have changed in this diff.