Merge branch 'main' into fiddlecube-guard

Commit 42d6e7e4a1 by Kaushik Srinivasan, 2025-02-10 18:14:45 -08:00 (committed via GitHub)
69 changed files with 721 additions and 367 deletions

@@ -23,3 +23,7 @@ jobs:
 .pre-commit-config.yaml
 - uses: pre-commit/action@v3.0.1
+- name: Verify if there are any diff files after pre-commit
+  run: |
+    git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)

@@ -54,7 +54,7 @@ jobs:
 echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
 export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
-LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/test_inference.py --md-report-output "$REPORT_OUTPUT"
+LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
 - name: Output reports to the job summary
 if: always()

@@ -48,6 +48,7 @@ repos:
 hooks:
 - id: uv-export
 args: ["--frozen", "--no-hashes", "--no-emit-project"]
+- id: uv-sync
 # - repo: https://github.com/pre-commit/mirrors-mypy
 #   rev: v1.14.0

@@ -1,44 +0,0 @@
# Changelog
## 0.2.0
### Added
### Changed
### Removed
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
- Persistence for registered objects with distribution
- Ability to persist memory banks created for FAISS
- PostgreSQL KVStore implementation
- Environment variable placeholder support in run.yaml files
- Comprehensive Zero-to-Hero notebooks and quickstart guides
- Support for quantized models in Ollama
- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM
- Bedrock distribution with safety shields support
- Evals API with task registration and scoring functions
- MMLU and SimpleQA benchmark scoring functions
- Huggingface dataset provider integration for benchmarks
- Support for custom dataset registration from local paths
- Benchmark evaluation CLI tools with visualization tables
- RAG evaluation scoring functions and metrics
- Local persistence for datasets and eval tasks
### Changed
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
- Updated API signatures for dataset and eval task registration
- Restructured folder organization for providers
- Enhanced Docker build configuration
- Added version prefixing for REST API routes
- Enhanced evaluation task registration workflow
- Improved benchmark evaluation output formatting
- Restructured evals folder organization for better modularity
### Removed
- `llama stack configure` command

@@ -34,22 +34,22 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on
 ### API Providers
 Here is a list of the various API providers and available distributions to developers started easily,
 | **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
-|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:|
-| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
-| SambaNova | Hosted | | :heavy_check_mark: | | | |
-| Cerebras | Hosted | | :heavy_check_mark: | | | |
-| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | |
-| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | |
-| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | |
-| Groq | Hosted | | :heavy_check_mark: | | | |
-| Ollama | Single Node | | :heavy_check_mark: | | | |
-| TGI | Hosted and Single Node | | :heavy_check_mark: | | | |
-| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | |
-| Chroma | Single Node | | | :heavy_check_mark: | | |
-| PG Vector | Single Node | | | :heavy_check_mark: | | |
-| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | |
-| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | |
+|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
+| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ |
+| SambaNova | Hosted | | ✅ | | | |
+| Cerebras | Hosted | | ✅ | | | |
+| Fireworks | Hosted | ✅ | ✅ | ✅ | | |
+| AWS Bedrock | Hosted | | ✅ | | ✅ | |
+| Together | Hosted | ✅ | ✅ | | ✅ | |
+| Groq | Hosted | | ✅ | | | |
+| Ollama | Single Node | | ✅ | | | |
+| TGI | Hosted and Single Node | | ✅ | | | |
+| NVIDIA NIM | Hosted and Single Node | | ✅ | | | |
+| Chroma | Single Node | | | ✅ | | |
+| PG Vector | Single Node | | | ✅ | | |
+| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | |
+| vLLM | Hosted and Single Node | | ✅ | | | |
 ### Distributions

@@ -69,6 +69,40 @@
"fiddlecube": [
"httpx"
],
"dell": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [ "fireworks": [
"aiosqlite", "aiosqlite",
"autoevals", "autoevals",
@ -255,6 +289,38 @@
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
], ],
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [ "ollama": [
"aiohttp", "aiohttp",
"aiosqlite", "aiosqlite",
@ -322,6 +388,36 @@
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
], ],
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [ "tgi": [
"aiohttp", "aiohttp",
"aiosqlite", "aiosqlite",
@ -424,101 +520,5 @@
"vllm", "vllm",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"dell": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
]
}

docs/conftest.py (new file, +9 lines)

@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
def pytest_collection_modifyitems(items):
for item in items:
item.name = item.name.replace(' ', '_')

@@ -86,7 +86,6 @@
 "# NBVAL_SKIP\n",
 "\n",
 "!apt-get install -y bubblewrap\n",
-"# install a branch of llama stack\n",
 "import os\n",
 "os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n",
 "!pip install uv\n",
@@ -3397,6 +3396,231 @@
 "response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n",
 "pprint(response)\n"
 ]
},
{
"cell_type": "markdown",
"id": "ad077440",
"metadata": {},
"source": [
"## 4. Image Understanding with Llama 3.2\n",
"\n",
"Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image."
]
},
{
"cell_type": "markdown",
"id": "82e381ec",
"metadata": {},
"source": [
"### 4.1 Setup and helpers\n",
"\n",
"Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "865fc5a8",
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-stack-client==0.1.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44e05e16",
"metadata": {},
"outputs": [],
"source": [
"!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "469750f7",
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def display_image(path):\n",
" img = Image.open(path)\n",
" plt.imshow(img)\n",
" plt.axis('off')\n",
" plt.show()\n",
"\n",
"display_image(\"Llama_Repo.jpeg\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2c1e1c2",
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"\n",
"def encode_image(image_path):\n",
" with open(image_path, \"rb\") as image_file:\n",
" base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
" base64_url = f\"data:image/png;base64,{base64_string}\"\n",
" return base64_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c565f99e",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client import LlamaStackClient\n",
"\n",
"LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n",
"LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\""
]
},
{
"cell_type": "markdown",
"id": "7737cd41",
"metadata": {},
"source": [
"### 4.2 Using Llama Stack Chat API\n",
"\n",
"The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7914894",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.inference.event_logger import EventLogger\n",
"\n",
"async def run_main(image_path: str, prompt):\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" message = {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }\n",
"\n",
" response = client.inference.chat_completion(\n",
" messages=[message],\n",
" model_id=LLAMA32_11B_INSTRUCT,\n",
" stream=False,\n",
" )\n",
"\n",
" print(response.completion_message.content.lower().strip())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ee09b97",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
},
{
"cell_type": "markdown",
"id": "e741d7b9",
"metadata": {},
"source": [
"### 4.3 Using Llama Stack Agent API\n",
"\n",
"The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9a83275",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
"\n",
"async def run_main(image_path, prompt):\n",
" base64_image = encode_image(image_path)\n",
"\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" agent_config = AgentConfig(\n",
" model=LLAMA32_11B_INSTRUCT,\n",
" instructions=\"You are a helpful assistant\",\n",
" enable_session_persistence=False,\n",
" )\n",
"\n",
" agent = Agent(client, agent_config)\n",
" session_id = agent.create_session(\"test-session\")\n",
"\n",
" response = agent.create_turn(\n",
" messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }],\n",
" session_id=session_id,\n",
" )\n",
"\n",
" for log in EventLogger().log(response):\n",
" log.print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15d0098b",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
}
],
"metadata": {

@@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI a
 The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them.
-**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)
+**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
 Here are some key topics that will help you build effective agents:

@@ -36,13 +36,12 @@ chunks = [
 "content": "Your document text here",
 "mime_type": "text/plain",
 },
-    ...,
 ]
-client.vector_io.insert(vector_db_id, chunks)
+client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
 # You can then query for these chunks
 chunks_response = client.vector_io.query(
-    vector_db_id, query="What do you know about..."
+    vector_db_id=vector_db_id, query="What do you know about..."
 )
 ```
@@ -72,8 +71,8 @@ client.tool_runtime.rag_tool.insert(
 # Query documents
 results = client.tool_runtime.rag_tool.query(
-    vector_db_id=vector_db_id,
-    query="What do you know about...",
+    vector_db_ids=[vector_db_id],
+    content="What do you know about...",
 )
 ```
@@ -82,10 +81,14 @@ results = client.tool_runtime.rag_tool.query(
 One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
 ```python
+from llama_stack_client.types.agent_create_params import AgentConfig
+from llama_stack_client.lib.agents.agent import Agent
 # Configure agent with memory
 agent_config = AgentConfig(
-    model="Llama3.2-3B-Instruct",
+    model="meta-llama/Llama-3.2-3B-Instruct",
     instructions="You are a helpful assistant",
+    enable_session_persistence=False,
     toolgroups=[
         {
             "name": "builtin::rag",
@@ -105,10 +108,10 @@ response = agent.create_turn(
         {"role": "user", "content": "I am providing some documents for reference."}
     ],
     documents=[
-        dict(
-            content="https://raw.githubusercontent.com/example/doc.rst",
-            mime_type="text/plain",
-        )
+        {
+            "content": "https://raw.githubusercontent.com/example/doc.rst",
+            "mime_type": "text/plain",
+        }
     ],
     session_id=session_id,
 )
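
For reference, here is a minimal, self-contained sketch of the keyword-argument style these docs now use (the server URL and `vector_db_id` below are placeholders, and the vector DB is assumed to be registered already):

```python
from llama_stack_client import LlamaStackClient

# Placeholder endpoint and vector DB id; adjust to your own deployment.
client = LlamaStackClient(base_url="http://localhost:8321")
vector_db_id = "my_documents"

# Insert chunks with explicit keyword arguments, as in the updated docs.
chunks = [{"content": "Your document text here", "mime_type": "text/plain"}]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)

# Low-level query against the vector store.
chunks_response = client.vector_io.query(
    vector_db_id=vector_db_id, query="What do you know about..."
)

# Higher-level RAG tool query: note the plural vector_db_ids and the content parameter.
results = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id], content="What do you know about..."
)
```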

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # Dell Distribution of Llama Stack

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # Fireworks Distribution
 ```{toctree}

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # Meta Reference Distribution
 ```{toctree}

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # Meta Reference Quantized Distribution
 ```{toctree}

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # Ollama Distribution
 ```{toctree}

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # Remote vLLM Distribution
 ```{toctree}
 :maxdepth: 2

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # SambaNova Distribution
 ```{toctree}

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # TGI Distribution

@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 # Together Distribution
 ```{toctree}

@@ -2,7 +2,7 @@
 ```{admonition} News
 :class: tip
-Llama Stack 0.1.1 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.1) for more details.
+Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details.
 ```
 # Llama Stack

@@ -22,9 +22,9 @@ class StackListProviders(Subcommand):
 self.parser.set_defaults(func=self._run_providers_list_cmd)
 def _add_arguments(self):
-from llama_stack.distribution.datatypes import Api
+from llama_stack.distribution.distribution import providable_apis
-api_values = [a.value for a in Api]
+api_values = [api.value for api in providable_apis()]
 self.parser.add_argument(
 "api",
 type=str,

@@ -55,6 +55,16 @@ class StackRun(Subcommand):
 default=[],
 metavar="KEY=VALUE",
 )
+self.parser.add_argument(
+"--tls-keyfile",
+type=str,
+help="Path to TLS key file for HTTPS",
+)
+self.parser.add_argument(
+"--tls-certfile",
+type=str,
+help="Path to TLS certificate file for HTTPS",
+)
 def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
 import importlib.resources
@@ -178,4 +188,7 @@ class StackRun(Subcommand):
 return
 run_args.extend(["--env", f"{key}={value}"])
+if args.tls_keyfile and args.tls_certfile:
+run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
 run_with_pty(run_args)

@@ -117,6 +117,23 @@ class Provider(BaseModel):
 config: Dict[str, Any]
+class ServerConfig(BaseModel):
+port: int = Field(
+default=8321,
+description="Port to listen on",
+ge=1024,
+le=65535,
+)
+tls_certfile: Optional[str] = Field(
+default=None,
+description="Path to TLS certificate file for HTTPS",
+)
+tls_keyfile: Optional[str] = Field(
+default=None,
+description="Path to TLS key file for HTTPS",
+)
 class StackRunConfig(BaseModel):
 version: str = LLAMA_STACK_RUN_CONFIG_VERSION
@@ -159,6 +176,11 @@ a default SQLite store will be used.""",
 eval_tasks: List[EvalTaskInput] = Field(default_factory=list)
 tool_groups: List[ToolGroupInput] = Field(default_factory=list)
+server: ServerConfig = Field(
+default_factory=ServerConfig,
+description="Configuration for the HTTP(S) server",
+)
 class BuildConfig(BaseModel):
 version: str = LLAMA_STACK_BUILD_CONFIG_VERSION
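
For orientation, a minimal sketch of what the new `server` block in a run.yaml could look like once TLS is enabled (the certificate and key paths are placeholders; HTTPS only switches on when both are set, and the same values can alternatively be passed to `llama stack run` via the new `--tls-keyfile`/`--tls-certfile` flags):

```yaml
# Hypothetical run.yaml excerpt; cert/key paths are placeholders.
server:
  port: 8321                       # default port from ServerConfig
  tls_certfile: /path/to/cert.pem  # optional; enables HTTPS together with tls_keyfile
  tls_keyfile: /path/to/key.pem
```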

@@ -17,17 +17,6 @@ from typing import Any, get_args, get_origin, Optional, TypeVar
import httpx
import yaml
from llama_stack_client import (
APIResponse,
AsyncAPIResponse,
AsyncLlamaStackClient,
AsyncStream,
LlamaStackClient,
NOT_GIVEN,
)
from pydantic import BaseModel, TypeAdapter
from rich.console import Console
from termcolor import cprint
from llama_stack.distribution.build import print_pip_install_help
from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
@@ -46,6 +35,17 @@ from llama_stack.providers.utils.telemetry.tracing import (
setup_logger,
start_trace,
)
from llama_stack_client import (
APIResponse,
AsyncAPIResponse,
AsyncLlamaStackClient,
AsyncStream,
LlamaStackClient,
NOT_GIVEN,
)
from pydantic import BaseModel, TypeAdapter
from rich.console import Console
from termcolor import cprint
 T = TypeVar("T")
@@ -198,6 +198,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
 async def initialize(self) -> bool:
 try:
+self.endpoint_impls = None
 self.impls = await construct_stack(self.config, self.custom_provider_registry)
 except ModuleNotFoundError as _e:
 cprint(_e.msg, "red")
@@ -213,7 +214,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
 f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n",
 "yellow",
 )
-return False
+raise _e
 if Api.telemetry in self.impls:
 setup_logger(self.impls[Api.telemetry])

@@ -282,8 +282,19 @@ def main():
 action="append",
 help="Environment variables in KEY=value format. Can be specified multiple times.",
 )
+parser.add_argument(
+"--tls-keyfile",
+help="Path to TLS key file for HTTPS",
+required="--tls-certfile" in sys.argv,
+)
+parser.add_argument(
+"--tls-certfile",
+help="Path to TLS certificate file for HTTPS",
+required="--tls-keyfile" in sys.argv,
+)
 args = parser.parse_args()
 if args.env:
 for env_pair in args.env:
 try:
@@ -381,11 +392,36 @@ def main():
 import uvicorn
-# FYI this does not do hot-reloads
+# Configure SSL if certificates are provided
port = args.port or config.server.port
ssl_config = None
if args.tls_keyfile:
keyfile = args.tls_keyfile
certfile = args.tls_certfile
else:
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
ssl_config = {
"ssl_keyfile": keyfile,
"ssl_certfile": certfile,
}
print(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
 listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0"
-print(f"Listening on {listen_host}:{args.port}")
+print(f"Listening on {listen_host}:{port}")
-uvicorn.run(app, host=listen_host, port=args.port)
uvicorn_config = {
"app": app,
"host": listen_host,
"port": port,
}
if ssl_config:
uvicorn_config.update(ssl_config)
uvicorn.run(**uvicorn_config)
def extract_path_params(route: str) -> List[str]:

@@ -34,6 +34,7 @@ shift
 # Process environment variables from --env arguments
 env_vars=""
+other_args=""
 while [[ $# -gt 0 ]]; do
 case "$1" in
 --env)
@@ -48,6 +49,7 @@ while [[ $# -gt 0 ]]; do
 fi
 ;;
 *)
+other_args="$other_args $1"
 shift
 ;;
 esac
@@ -61,4 +63,5 @@ $CONDA_PREFIX/bin/python \
 -m llama_stack.distribution.server.server \
 --yaml-config "$yaml_config" \
 --port "$port" \
-$env_vars
+$env_vars \
+$other_args

@@ -40,8 +40,12 @@ shift
 port="$1"
 shift
+# Initialize other_args
+other_args=""
 # Process environment variables from --env arguments
 env_vars=""
 while [[ $# -gt 0 ]]; do
 case "$1" in
 --env)
@@ -55,6 +59,7 @@ while [[ $# -gt 0 ]]; do
 fi
 ;;
 *)
+other_args="$other_args $1"
 shift
 ;;
 esac
@@ -93,5 +98,8 @@ $CONTAINER_BINARY run $CONTAINER_OPTS -it \
 -v "$yaml_config:/app/config.yaml" \
 $mounts \
 --env LLAMA_STACK_PORT=$port \
---entrypoint='["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]' \
+--entrypoint python \
-$container_image:$version_tag
+$container_image:$version_tag \
+-m llama_stack.distribution.server.server \
+--yaml-config /app/config.yaml \
+$other_args

@@ -67,7 +67,6 @@ def generate_bwrap_command(bind_dirs: List[str]) -> str:
 @dataclass
 class CodeExecutionContext:
 matplotlib_dump_dir: str
-use_proxy: bool = False
 @dataclass

@@ -26,6 +26,7 @@ from llama_stack.apis.inference import (
 Message,
 ResponseFormat,
 ToolChoice,
+ToolConfig,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.providers.remote.inference.groq.config import GroqConfig

@@ -352,24 +352,20 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
 return EmbeddingsResponse(embeddings=embeddings)
 async def register_model(self, model: Model) -> Model:
-# ollama does not have embedding models running. Check if the model is in list of available models.
-if model.model_type == ModelType.embedding:
-response = await self.client.list()
+async def check_model_availability(model_id: str):
+response = await self.client.ps()
 available_models = [m["model"] for m in response["models"]]
-if model.provider_resource_id not in available_models:
+if model_id not in available_models:
 raise ValueError(
-f"Model '{model.provider_resource_id}' is not available in Ollama. "
-f"Available models: {', '.join(available_models)}"
+f"Model '{model_id}' is not available in Ollama. Available models: {', '.join(available_models)}"
 )
+if model.model_type == ModelType.embedding:
+await check_model_availability(model.provider_resource_id)
 return model
 model = await self.register_helper.register_model(model)
-models = await self.client.ps()
-available_models = [m["model"] for m in models["models"]]
-if model.provider_resource_id not in available_models:
-raise ValueError(
-f"Model '{model.provider_resource_id}' is not available in Ollama. "
-f"Available models: {', '.join(available_models)}"
-)
+await check_model_availability(model.provider_resource_id)
 return model

@@ -12,8 +12,8 @@ from .config import QdrantConfig
 async def get_adapter_impl(config: QdrantConfig, deps: Dict[Api, ProviderSpec]):
-from .qdrant import QdrantVectorMemoryAdapter
+from .qdrant import QdrantVectorDBAdapter
-impl = QdrantVectorMemoryAdapter(config, deps[Api.inference])
+impl = QdrantVectorDBAdapter(config, deps[Api.inference])
 await impl.initialize()
 return impl

@@ -55,7 +55,7 @@ class QdrantIndex(EmbeddingIndex):
 points = []
 for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-chunk_id = f"{chunk.document_id}:chunk-{i}"
+chunk_id = f"{chunk.metadata['document_id']}:chunk-{i}"
 points.append(
 PointStruct(
 id=convert_id(chunk_id),
@@ -93,6 +93,9 @@ class QdrantIndex(EmbeddingIndex):
 return QueryChunksResponse(chunks=chunks, scores=scores)
+async def delete(self):
+await self.client.delete_collection(collection_name=self.collection_name)
 class QdrantVectorDBAdapter(VectorIO, VectorDBsProtocolPrivate):
 def __init__(self, config: QdrantConfig, inference_api: Api.inference) -> None:

@@ -95,7 +95,7 @@ class TestDatasetIO:
 assert len(response) == 1
 assert response[0].identifier == "test_dataset"
-with pytest.raises(Exception) as exc_info:
+with pytest.raises(ValueError):
 # unregister a dataset that does not exist
 await datasets_impl.unregister_dataset("test_dataset2")
@@ -104,7 +104,7 @@
 assert isinstance(response, list)
 assert len(response) == 0
-with pytest.raises(Exception) as exc_info:
+with pytest.raises(ValueError):
 await datasets_impl.unregister_dataset("test_dataset")
 @pytest.mark.asyncio

@@ -32,7 +32,7 @@ class TestModelRegistration:
 )
 # Try to register a model that's too large for local inference
-with pytest.raises(ValueError) as exc_info:
+with pytest.raises(ValueError):
 await models_impl.register_model(
 model_id="Llama3.1-70B-Instruct",
 )
@@ -42,7 +42,7 @@
 _, models_impl = inference_stack
 # Try to register a non-existent model
-with pytest.raises(Exception) as exc_info:
+with pytest.raises(ValueError):
 await models_impl.register_model(
 model_id="Llama3-NonExistent-Model",
 )
@@ -59,7 +59,7 @@
 },
 )
-with pytest.raises(ValueError) as exc_info:
+with pytest.raises(ValueError):
 await models_impl.register_model(
 model_id="custom-model-2",
 metadata={
@@ -88,7 +88,7 @@ class TestModelRegistration:
 async def test_register_with_invalid_llama_model(self, inference_stack):
 _, models_impl = inference_stack
-with pytest.raises(ValueError) as exc_info:
+with pytest.raises(ValueError):
 await models_impl.register_model(
 model_id="custom-model-2",
 metadata={"llama_model": "invalid-llama-model"},

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import base64
 from pathlib import Path
 import pytest
-from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem, URL
+from llama_stack.apis.common.content_types import URL, ImageContentItem, TextContentItem
 from llama_stack.apis.inference import (
 ChatCompletionResponse,
 ChatCompletionResponseEventType,
@@ -23,7 +23,7 @@ from .utils import group_chunks
 THIS_DIR = Path(__file__).parent
 with open(THIS_DIR / "pasta.jpeg", "rb") as f:
-PASTA_IMAGE = f.read()
+PASTA_IMAGE = base64.b64encode(f.read()).decode("utf-8")
 class TestVisionModelInference:

@@ -29,7 +29,7 @@ def find_template_dirs(templates_dir: Path) -> Iterator[Path]:
 if not templates_dir.exists():
 raise FileNotFoundError(f"Templates directory not found: {templates_dir}")
-return (d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__")
+return sorted(d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__")
 def process_template(template_dir: Path, progress) -> None:

@@ -115,3 +115,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -117,3 +117,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -116,3 +116,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -107,3 +107,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -172,3 +172,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -161,3 +161,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -124,3 +124,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -114,3 +114,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -124,3 +124,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -114,3 +114,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -126,3 +126,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -115,3 +115,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -117,3 +117,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -147,3 +147,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -16,7 +16,7 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
 You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
-{%- if run_config_env_vars %}
+{% if run_config_env_vars %}
 ### Environment Variables
 The following environment variables can be configured:

@@ -121,3 +121,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -110,3 +110,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -126,3 +126,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -115,3 +115,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -126,3 +126,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -131,8 +131,15 @@ class DistributionTemplate(BaseModel):
 providers_str = ", ".join(f"`{p}`" for p in providers)
 providers_table += f"| {api} | {providers_str} |\n"
-template = "<!-- This file was auto-generated by distro_codegen.py, please edit source -->\n"
-template += self.template_path.read_text()
+template = self.template_path.read_text()
+comment = "<!-- This file was auto-generated by distro_codegen.py, please edit source -->\n"
+orphantext = "---\norphan: true\n---\n"
+if template.startswith(orphantext):
+template = template.replace(orphantext, orphantext + comment)
+else:
+template = comment + template
 # Render template with rich-generated table
 env = jinja2.Environment(
 trim_blocks=True,

@@ -114,3 +114,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -113,3 +113,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -167,3 +167,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -156,3 +156,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -117,3 +117,5 @@ tool_groups:
 provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
 provider_id: code-interpreter
+server:
+  port: 8321

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "llama_stack"
-version = "0.1.1"
+version = "0.1.2"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@@ -25,8 +25,8 @@ dependencies = [
 "fire",
 "httpx",
 "huggingface-hub",
-"llama-models>=0.1.1",
-"llama-stack-client>=0.1.1",
+"llama-models>=0.1.2",
+"llama-stack-client>=0.1.2",
 "prompt-toolkit",
 "python-dotenv",
 "pydantic>=2",

@@ -4,6 +4,7 @@ annotated-types==0.7.0
 anyio==4.8.0
 blobfile==3.0.0
 certifi==2025.1.31
+chardet==5.2.0
 charset-normalizer==3.4.1
 click==8.1.8
 colorama==0.4.6 ; sys_platform == 'win32'
@@ -18,8 +19,8 @@ httpx==0.28.1
 huggingface-hub==0.28.1
 idna==3.10
 jinja2==3.1.5
-llama-models==0.1.1
-llama-stack-client==0.1.1
+llama-models==0.1.2
+llama-stack-client==0.1.2
 lxml==5.3.0
 markdown-it-py==3.0.0
 markupsafe==3.0.2
@@ -34,6 +35,7 @@ pycryptodomex==3.21.0
 pydantic==2.10.6
 pydantic-core==2.27.2
 pygments==2.19.1
+pypdf==5.2.0
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2025.1

@@ -4,18 +4,18 @@ You can run llama stack integration tests on either a Llama Stack Library or a L
 To test on a Llama Stack library with certain configuration, run
 ```bash
 LLAMA_STACK_CONFIG=./llama_stack/templates/cerebras/run.yaml
-pytest -s -v tests/client-sdk/inference/test_inference.py
+pytest -s -v tests/client-sdk/inference/
 ```
 or just the template name
 ```bash
 LLAMA_STACK_CONFIG=together
-pytest -s -v tests/client-sdk/inference/test_inference.py
+pytest -s -v tests/client-sdk/inference/
 ```
 To test on a Llama Stack endpoint, run
 ```bash
 LLAMA_STACK_BASE_URL=http://localhost:8089
-pytest -s -v tests/client-sdk/inference/test_inference.py
+pytest -s -v tests/client-sdk/inference
 ```
 ## Report Generation

@@ -263,12 +263,14 @@ def test_custom_tool(llama_stack_client, agent_config):
 assert "CustomTool" in logs_str
-def test_override_system_message_behavior(llama_stack_client, agent_config):
+# TODO: fix this flaky test
+def xtest_override_system_message_behavior(llama_stack_client, agent_config):
 client_tool = TestClientTool()
 agent_config = {
 **agent_config,
 "instructions": "You are a pirate",
 "client_tools": [client_tool.get_tool_definition()],
+"model": "meta-llama/Llama-3.2-3B-Instruct",
 }
 agent = Agent(llama_stack_client, agent_config, client_tools=(client_tool,))

@@ -4,9 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import base64
-import pathlib
 import pytest
 from pydantic import BaseModel
@@ -14,6 +11,7 @@ PROVIDER_TOOL_PROMPT_FORMAT = {
 "remote::ollama": "json",
 "remote::together": "json",
 "remote::fireworks": "json",
+"remote::vllm": "json",
 }
 PROVIDER_LOGPROBS_TOP_K = set(
@@ -56,23 +54,6 @@ def get_weather_tool_definition():
 }
@pytest.fixture
def image_path():
return pathlib.Path(__file__).parent / "dog.png"
@pytest.fixture
def base64_image_data(image_path):
# Convert the image to base64
return base64.b64encode(image_path.read_bytes()).decode("utf-8")
@pytest.fixture
def base64_image_url(base64_image_data, image_path):
# suffix includes the ., so we remove it
return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}"
 def test_text_completion_non_streaming(llama_stack_client, text_model_id):
 response = llama_stack_client.inference.completion(
 content="Complete the sentence using one word: Roses are red, violets are ",
@@ -176,8 +157,8 @@ def test_text_completion_structured_output(llama_stack_client, text_model_id, in
 @pytest.mark.parametrize(
 "question,expected",
 [
-("What are the names of planets in our solar system?", "Earth"),
-("What are the names of the planets that have rings around them?", "Saturn"),
+("Which planet do humans live on?", "Earth"),
+("Which planet has rings around it with a name starting with letter S?", "Saturn"),
 ],
 )
 def test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, question, expected):
@@ -299,101 +280,3 @@ def test_text_chat_completion_structured_output(llama_stack_client, text_model_i
 assert answer.last_name == "Jordan"
 assert answer.year_of_birth == 1963
 assert answer.num_seasons_in_nba == 15
def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
# TODO: Replace with Github based URI to resources/sample1.jpg
"uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
},
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
def test_image_chat_completion_streaming(llama_stack_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
# TODO: Replace with Github based URI to resources/sample1.jpg
"uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
},
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=True,
)
streamed_content = ""
for chunk in response:
streamed_content += chunk.event.delta.text.lower()
assert len(streamed_content) > 0
assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})
@pytest.mark.parametrize("type_", ["url", "data"])
def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_):
image_spec = {
"url": {
"type": "image",
"image": {
"url": {
"uri": base64_image_url,
},
},
},
"data": {
"type": "image",
"image": {
"data": base64_image_data,
},
},
}[type_]
message = {
"role": "user",
"content": [
image_spec,
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0

@@ -0,0 +1,133 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import pathlib
import pytest
@pytest.fixture(scope="session")
def inference_provider_type(llama_stack_client):
providers = llama_stack_client.providers.list()
inference_providers = [p for p in providers if p.api == "inference"]
assert len(inference_providers) > 0, "No inference providers found"
return inference_providers[0].provider_type
@pytest.fixture
def image_path():
return pathlib.Path(__file__).parent / "dog.png"
@pytest.fixture
def base64_image_data(image_path):
# Convert the image to base64
return base64.b64encode(image_path.read_bytes()).decode("utf-8")
@pytest.fixture
def base64_image_url(base64_image_data, image_path):
# suffix includes the ., so we remove it
return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}"
def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
# TODO: Replace with Github based URI to resources/sample1.jpg
"uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
},
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
def test_image_chat_completion_streaming(llama_stack_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
# TODO: Replace with Github based URI to resources/sample1.jpg
"uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
},
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=True,
)
streamed_content = ""
for chunk in response:
streamed_content += chunk.event.delta.text.lower()
assert len(streamed_content) > 0
assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})
@pytest.mark.parametrize("type_", ["url", "data"])
def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_):
image_spec = {
"url": {
"type": "image",
"image": {
"url": {
"uri": base64_image_url,
},
},
},
"data": {
"type": "image",
"image": {
"data": base64_image_data,
},
},
}[type_]
message = {
"role": "user",
"content": [
image_spec,
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0

uv.lock (generated)

@@ -687,7 +687,7 @@ wheels = [
 [[package]]
 name = "llama-models"
-version = "0.1.1"
+version = "0.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
 { name = "jinja2" },
@@ -696,14 +696,14 @@ dependencies = [
 { name = "pyyaml" },
 { name = "tiktoken" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/df/80/4a4595cf5e55f71c0e15b85ff2f4c04b0742bf664ede062a09c9d383bf7b/llama_models-0.1.1.tar.gz", hash = "sha256:7cb5a9fe38485b47aff4c93e183d6d390a676a7619f3355502576b652f17733a", size = 1608412 }
+sdist = { url = "https://files.pythonhosted.org/packages/b5/f2/ed8310d4677cd38ab45ffba45aea2a4e9882b640045ad9c3198ac69e5a85/llama_models-0.1.2.tar.gz", hash = "sha256:1266eaec7a8db336e4ed034d2b494189ccb7fd6d6b7aefe874eee749a4340b9b", size = 1608069 }
 wheels = [
-{ url = "https://files.pythonhosted.org/packages/d9/93/d49dd0f0cd37df1a7a7fb25444d010f626cdf42b21eea11d839b0f6a808a/llama_models-0.1.1-py3-none-any.whl", hash = "sha256:7e4f15dc4f6f011852ea2c42f9770b75140f5eca670b32cc67fc0a4605c55f89", size = 1650981 },
+{ url = "https://files.pythonhosted.org/packages/55/a7/34b9e88ef4109759c8881f43b8006139e3d13d54c440b8c571b253655f54/llama_models-0.1.2-py3-none-any.whl", hash = "sha256:8aa5287d1c6325698991ff677e71148cac347e07493bb5b3ab891e614b89e1f8", size = 1651273 },
 ]
 [[package]]
 name = "llama-stack"
-version = "0.1.1"
+version = "0.1.2"
 source = { editable = "." }
 dependencies = [
 { name = "blobfile" },
@@ -751,8 +751,8 @@ requires-dist = [
 { name = "fire" },
 { name = "httpx" },
 { name = "huggingface-hub" },
-{ name = "llama-models", specifier = ">=0.1.1" },
-{ name = "llama-stack-client", specifier = ">=0.1.1" },
+{ name = "llama-models", specifier = ">=0.1.2" },
+{ name = "llama-stack-client", specifier = ">=0.1.2" },
 { name = "myst-parser", marker = "extra == 'docs'" },
 { name = "nbval", marker = "extra == 'dev'" },
 { name = "pre-commit", marker = "extra == 'dev'" },
@@ -780,7 +780,7 @@ requires-dist = [
 [[package]]
 name = "llama-stack-client"
-version = "0.1.1"
+version = "0.1.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
 { name = "anyio" },
@@ -797,9 +797,9 @@ dependencies = [
 { name = "tqdm" },
 { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/07/42/7004958ac1a6da9a8060decf0d9120fdeb3b2775de090a0a473f2ee4a27d/llama_stack_client-0.1.1.tar.gz", hash = "sha256:3e549a848ade959d342fa52ec49b1913b7bb615a77b5b8dcaefe6ff94409049e", size = 179729 }
+sdist = { url = "https://files.pythonhosted.org/packages/9e/75/8b41a3026c871a8650cd8d2cfda9f891a9163458813574f36518bb40afe4/llama_stack_client-0.1.2.tar.gz", hash = "sha256:94277ddae52be557d771dcdc15d85af9012b5aa87439dd69ec1dc0ff486b0c8e", size = 188023 }
 wheels = [
-{ url = "https://files.pythonhosted.org/packages/80/66/5255c09dc001ff437fd6fe6fad27142035b60073df243f7df0494095f605/llama_stack_client-0.1.1-py3-none-any.whl", hash = "sha256:e07d58fdcc1eaa370dd00b94c2dd1a8169c0ac60c37f6f2772cbc2c5b63f2e62", size = 348665 },
+{ url = "https://files.pythonhosted.org/packages/c4/32/3a3a97eecff1f1e3a1dc90e9b00681abea11ec4f43a7ca549981261e18b6/llama_stack_client-0.1.2-py3-none-any.whl", hash = "sha256:85ff0fb57a62d7d0470cfaa2b07a595c9fb3483297944d5e5a066db850d38ccd", size = 359415 },
 ]
 [[package]]