forked from phoenix-oss/llama-stack-mirror

commit a54d757ade ("merge")
197 changed files with 9392 additions and 3089 deletions

@@ -78,7 +78,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
         },

@@ -47,9 +47,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \
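
A quick way to confirm that a container started this way is serving on the new default port is a health probe. This is a minimal sketch, assuming the Llama Stack server's `GET /v1/health` route and the 8321 default from the hunk above:

```bash
# Probe the stack started above; /v1/health is assumed to be the health route.
curl -sf http://localhost:8321/v1/health && echo "stack is up"
```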

@@ -37,7 +37,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/bedrock/trace_store.db}
   datasetio:

@@ -102,7 +102,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "CEREBRAS_API_KEY": (

@@ -39,9 +39,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \

@@ -55,6 +56,6 @@ docker run \
 ```bash
 llama stack build --template cerebras --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
 ```
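
Once a stack built and started this way is running, a minimal client call can confirm it is serving. A sketch assuming the `llama-stack-client` Python package, pointed at the port used above:

```python
from llama_stack_client import LlamaStackClient

# Assumes the server from the hunk above is listening on port 8321.
client = LlamaStackClient(base_url="http://localhost:8321")
for model in client.models.list():
    print(model.identifier)
```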

@@ -58,7 +58,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/cerebras/trace_store.db}
   tool_runtime:

@@ -108,7 +108,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "FIREWORKS_API_KEY": (

@@ -40,7 +40,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ci-tests/trace_store.db}
   datasetio:

@@ -43,6 +43,7 @@ export CUDA_VISIBLE_DEVICES=0
 export LLAMA_STACK_PORT=8321

 docker run --rm -it \
+  --pull always \
   --network host \
   -v $HOME/.cache/huggingface:/data \
   -e HF_TOKEN=$HF_TOKEN \

@@ -66,6 +67,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 export CUDA_VISIBLE_DEVICES=1

 docker run --rm -it \
+  --pull always \
   --network host \
   -v $HOME/.cache/huggingface:/data \
   -e HF_TOKEN=$HF_TOKEN \

@@ -108,6 +110,7 @@ This method allows you to get started quickly without having to build the distribution code.

 ```bash
 docker run -it \
+  --pull always \
   --network host \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v $HOME/.llama:/root/.llama \

@@ -135,6 +138,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v $HOME/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \

@@ -43,7 +43,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dell/trace_store.db}
   datasetio:

@@ -39,7 +39,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dell/trace_store.db}
   datasetio:

@@ -184,7 +184,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "FIREWORKS_API_KEY": (

@@ -69,7 +69,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dev/trace_store.db}
   datasetio:

@@ -28,7 +28,11 @@ providers:
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
-    config: {}
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db
   scoring:
   - provider_id: basic
     provider_type: inline::basic

@@ -40,7 +44,11 @@ providers:
   datasetio:
   - provider_id: localfs
     provider_type: inline::localfs
-    config: {}
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/experimental-post-training}/localfs_datasetio.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -58,7 +66,7 @@ providers:
       persistence_store:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/agents_store.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/experimental-post-training}/agents_store.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard

@@ -70,7 +78,7 @@ providers:
       kvstore:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/faiss_store.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/experimental-post-training}/faiss_store.db
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search

@@ -82,7 +90,7 @@ providers:
 metadata_store:
   namespace: null
   type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/registry.db
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/experimental-post-training}/registry.db
 models: []
 shields: []
 vector_dbs: []

@@ -49,9 +49,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \

@@ -160,7 +160,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "FIREWORKS_API_KEY": (

@@ -48,7 +48,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db}
   datasetio:

@@ -43,7 +43,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db}
   datasetio:

@@ -49,9 +49,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \

@@ -95,7 +95,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMASTACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "GROQ_API_KEY": (

@@ -43,7 +43,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/groq/trace_store.db}
   datasetio:

@@ -125,7 +125,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "HF_API_TOKEN": (

@@ -48,7 +48,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db}
   datasetio:

@@ -43,7 +43,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db}
   datasetio:

@@ -126,7 +126,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "HF_API_TOKEN": (

@@ -48,7 +48,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db}
   datasetio:

@@ -43,7 +43,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db}
   datasetio:

@@ -65,9 +65,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \

@@ -80,6 +81,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \

@@ -95,7 +97,7 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
 llama stack build --template {{ name }} --image-type conda
 llama stack run distributions/{{ name }}/run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```

@@ -103,7 +105,7 @@ If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
 llama stack run distributions/{{ name }}/run-with-safety.yaml \
-  --port 5001 \
+  --port 8321 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 ```

@@ -132,7 +132,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (

@@ -50,7 +50,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db}
   datasetio:

@@ -44,7 +44,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db}
   datasetio:

@@ -67,9 +67,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \

@@ -82,6 +83,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \

@@ -98,7 +98,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (

@@ -46,7 +46,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-quantized-gpu/trace_store.db}
   datasetio:

@@ -39,9 +39,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \

@@ -55,7 +56,7 @@ docker run \
 ```bash
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
   --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```

@@ -46,7 +46,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
   datasetio:

@@ -41,7 +41,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db}
   datasetio:

@@ -60,9 +60,10 @@ Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   llamastack/distribution-{{ name }} \

@@ -80,6 +81,7 @@ cd /path/to/llama-stack

 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \

@@ -96,7 +98,7 @@ docker run \
 Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.

 ```bash
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321

 llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml \

@@ -136,7 +136,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "OLLAMA_URL": (

@@ -41,7 +41,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
   datasetio:

@@ -39,7 +39,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
   datasetio:

@@ -167,7 +167,6 @@ def get_distribution_template() -> DistributionTemplate:
     default_datasets = [
         DatasetInput(
             dataset_id="simpleqa",
-            provider_id="huggingface",
             purpose=DatasetPurpose.eval_messages_answer,
             source=URIDataSource(
                 uri="huggingface://datasets/llamastack/simpleqa?split=train",

@@ -175,7 +174,6 @@ def get_distribution_template() -> DistributionTemplate:
         ),
         DatasetInput(
             dataset_id="mmlu_cot",
-            provider_id="huggingface",
             purpose=DatasetPurpose.eval_messages_answer,
             source=URIDataSource(
                 uri="huggingface://datasets/llamastack/mmlu_cot?split=test&name=all",

@@ -183,7 +181,6 @@ def get_distribution_template() -> DistributionTemplate:
         ),
         DatasetInput(
             dataset_id="gpqa_cot",
-            provider_id="huggingface",
             purpose=DatasetPurpose.eval_messages_answer,
             source=URIDataSource(
                 uri="huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main",

@@ -191,7 +188,6 @@ def get_distribution_template() -> DistributionTemplate:
         ),
         DatasetInput(
             dataset_id="math_500",
-            provider_id="huggingface",
             purpose=DatasetPurpose.eval_messages_answer,
             source=URIDataSource(
                 uri="huggingface://datasets/llamastack/math_500?split=test",

@@ -199,12 +195,25 @@ def get_distribution_template() -> DistributionTemplate:
         ),
         DatasetInput(
             dataset_id="bfcl",
-            provider_id="huggingface",
             purpose=DatasetPurpose.eval_messages_answer,
             source=URIDataSource(
                 uri="huggingface://datasets/llamastack/bfcl_v3?split=train",
             ),
         ),
+        DatasetInput(
+            dataset_id="ifeval",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/IfEval?split=train",
+            ),
+        ),
+        DatasetInput(
+            dataset_id="docvqa",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/docvqa?split=val",
+            ),
+        ),
     ]

     # TODO(xiyan): fix this back as registerable resources

@@ -234,6 +243,16 @@ def get_distribution_template() -> DistributionTemplate:
     #         dataset_id="bfcl",
     #         grader_ids=["basic::bfcl"],
     #     ),
+    #     BenchmarkInput(
+    #         benchmark_id="meta-reference-ifeval",
+    #         dataset_id="ifeval",
+    #         grader_ids=["basic::ifeval"],
+    #     ),
+    #     BenchmarkInput(
+    #         benchmark_id="meta-reference-docvqa",
+    #         dataset_id="docvqa",
+    #         grader_ids=["basic::docvqa"],
+    #     ),
     # ]

     return DistributionTemplate(

@@ -258,7 +277,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "TOGETHER_API_KEY": (

@@ -66,7 +66,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open-benchmark/trace_store.db}
   datasetio:

@@ -143,28 +142,24 @@ datasets:
     uri: huggingface://datasets/llamastack/simpleqa?split=train
   metadata: {}
   dataset_id: simpleqa
-  provider_id: huggingface
 - purpose: eval/messages-answer
   source:
     type: uri
     uri: huggingface://datasets/llamastack/mmlu_cot?split=test&name=all
   metadata: {}
   dataset_id: mmlu_cot
-  provider_id: huggingface
 - purpose: eval/messages-answer
   source:
     type: uri
     uri: huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main
   metadata: {}
   dataset_id: gpqa_cot
-  provider_id: huggingface
 - purpose: eval/messages-answer
   source:
     type: uri
     uri: huggingface://datasets/llamastack/math_500?split=test
   metadata: {}
   dataset_id: math_500
-  provider_id: huggingface
 - purpose: eval/messages-answer
   source:
     type: uri
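
The `purpose`/`source` shape in the entries above mirrors the datasets API these hunks migrate to. As a hedged sketch, registering the same dataset at runtime might look like this, assuming `client.datasets.register()` accepts these keyword arguments in this release:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Mirrors the simpleqa entry above; the exact register() signature is an assumption.
client.datasets.register(
    dataset_id="simpleqa",
    purpose="eval/messages-answer",
    source={
        "type": "uri",
        "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
    },
)
```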

@@ -181,7 +181,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "PASSTHROUGH_API_KEY": (

@@ -48,7 +48,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db}
   datasetio:

@@ -43,7 +43,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db}
   datasetio:

@@ -36,6 +36,7 @@ export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export CUDA_VISIBLE_DEVICES=0

 docker run \
+  --pull always \
   --runtime nvidia \
   --gpus $CUDA_VISIBLE_DEVICES \
   -v ~/.cache/huggingface:/root/.cache/huggingface \

@@ -48,6 +49,8 @@ docker run \
   --port $INFERENCE_PORT
 ```

+Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html).
+
 If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:

 ```bash

@@ -56,6 +59,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 export CUDA_VISIBLE_DEVICES=1

 docker run \
+  --pull always \
   --runtime nvidia \
   --gpus $CUDA_VISIBLE_DEVICES \
   -v ~/.cache/huggingface:/root/.cache/huggingface \

@@ -79,10 +83,11 @@ This method allows you to get started quickly without having to build the distribution code.
 ```bash
 export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321

 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \

@@ -104,6 +109,7 @@ cd /path/to/llama-stack

 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \

@@ -124,7 +130,7 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
 export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321

 cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
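
The tool-calling note added above implies extra flags on the vLLM server itself, not on Llama Stack. A sketch of such a launch, assuming the `vllm/vllm-openai` image and the `llama3_json` parser (both assumptions; pick the parser that matches your model per the linked vLLM docs):

```bash
export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct

# Hypothetical vLLM launch with tool calling enabled; the parser choice is model-dependent.
docker run \
  --runtime nvidia \
  --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -p $INFERENCE_PORT:$INFERENCE_PORT \
  vllm/vllm-openai:latest \
  --model $INFERENCE_MODEL \
  --port $INFERENCE_PORT \
  --enable-auto-tool-choice \
  --tool-call-parser llama3_json
```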

@@ -13,7 +13,7 @@ providers:
   - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL}
+      url: ${env.VLLM_URL:http://localhost:8000/v1}
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:true}

@@ -67,7 +67,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
   tool_runtime:
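
The change above gives `VLLM_URL` an inline default using the `${env.VAR:default}` placeholder style used throughout these run.yaml files. As an illustration only (not the project's actual resolver), the substitution rule can be sketched in a few lines of Python:

```python
import os
import re

# Illustrative only: "${env.NAME:default}" resolves to $NAME if set, else the default.
_ENV_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")

def resolve(value: str) -> str:
    def substitute(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        return os.environ.get(name, default or "")
    return _ENV_PLACEHOLDER.sub(substitute, value)

print(resolve("${env.VLLM_URL:http://localhost:8000/v1}"))
# -> "http://localhost:8000/v1" when VLLM_URL is unset
```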

@@ -13,7 +13,7 @@ providers:
   - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      url: ${env.VLLM_URL}
+      url: ${env.VLLM_URL:http://localhost:8000/v1}
       max_tokens: ${env.VLLM_MAX_TOKENS:4096}
       api_token: ${env.VLLM_API_TOKEN:fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:true}

@@ -60,7 +60,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
   tool_runtime:

@@ -43,7 +43,7 @@ def get_distribution_template() -> DistributionTemplate:
         provider_id="vllm-inference",
         provider_type="remote::vllm",
         config=VLLMInferenceAdapterConfig.sample_run_config(
-            url="${env.VLLM_URL}",
+            url="${env.VLLM_URL:http://localhost:8000/v1}",
         ),
     )
     embedding_provider = Provider(

@@ -133,7 +133,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (

@@ -49,9 +49,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \

@@ -51,7 +51,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/sambanova/trace_store.db}
   tool_runtime:

@@ -6,17 +6,19 @@

 from pathlib import Path

-from llama_stack.distribution.datatypes import (
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
+from llama_stack.distribution.datatypes import Provider, ShieldInput, ToolGroupInput
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.sambanova import SambaNovaImplConfig
 from llama_stack.providers.remote.inference.sambanova.models import MODEL_ENTRIES
 from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
-from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.providers.remote.vector_io.pgvector.config import (
+    PGVectorVectorIOConfig,
+)
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)


 def get_distribution_template() -> DistributionTemplate:

@@ -105,7 +107,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMASTACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "SAMBANOVA_API_KEY": (

@@ -38,6 +38,7 @@ export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export CUDA_VISIBLE_DEVICES=0

 docker run --rm -it \
+  --pull always \
   -v $HOME/.cache/huggingface:/data \
   -p $INFERENCE_PORT:$INFERENCE_PORT \
   --gpus $CUDA_VISIBLE_DEVICES \

@@ -58,6 +59,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 export CUDA_VISIBLE_DEVICES=1

 docker run --rm -it \
+  --pull always \
   -v $HOME/.cache/huggingface:/data \
   -p $SAFETY_PORT:$SAFETY_PORT \
   --gpus $CUDA_VISIBLE_DEVICES \

@@ -78,9 +80,10 @@ Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \

@@ -97,6 +100,7 @@ cd /path/to/llama-stack

 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \

@@ -43,7 +43,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db}
   datasetio:

@@ -42,7 +42,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db}
   datasetio:

@@ -127,7 +127,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (

@@ -49,9 +49,10 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
+  --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   llamastack/distribution-{{ name }} \
   --port $LLAMA_STACK_PORT \

@@ -48,7 +48,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db}
   datasetio:

@@ -43,7 +43,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db}
   datasetio:

@@ -156,7 +156,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "TOGETHER_API_KEY": (

@@ -47,7 +47,6 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/vllm-gpu/trace_store.db}
   datasetio:

@@ -98,7 +98,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (