Merge branch 'main' into fiddlecube-guard

Kaushik Srinivasan 2025-02-10 18:14:45 -08:00 committed by GitHub
commit 42d6e7e4a1
69 changed files with 721 additions and 367 deletions

View file

@ -23,3 +23,7 @@ jobs:
.pre-commit-config.yaml
- uses: pre-commit/action@v3.0.1
- name: Verify if there are any diff files after pre-commit
run: |
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)

View file

@ -54,7 +54,7 @@ jobs:
echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/test_inference.py --md-report-output "$REPORT_OUTPUT"
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
- name: Output reports to the job summary
if: always()

View file

@ -48,6 +48,7 @@ repos:
hooks:
- id: uv-export
args: ["--frozen", "--no-hashes", "--no-emit-project"]
- id: uv-sync
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.14.0

View file

@ -1,44 +0,0 @@
# Changelog
## 0.2.0
### Added
### Changed
### Removed
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
- Persistence for registered objects with distribution
- Ability to persist memory banks created for FAISS
- PostgreSQL KVStore implementation
- Environment variable placeholder support in run.yaml files
- Comprehensive Zero-to-Hero notebooks and quickstart guides
- Support for quantized models in Ollama
- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM
- Bedrock distribution with safety shields support
- Evals API with task registration and scoring functions
- MMLU and SimpleQA benchmark scoring functions
- Huggingface dataset provider integration for benchmarks
- Support for custom dataset registration from local paths
- Benchmark evaluation CLI tools with visualization tables
- RAG evaluation scoring functions and metrics
- Local persistence for datasets and eval tasks
### Changed
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
- Changed provider naming convention (`impls``inline`, `adapters``remote`)
- Updated API signatures for dataset and eval task registration
- Restructured folder organization for providers
- Enhanced Docker build configuration
- Added version prefixing for REST API routes
- Enhanced evaluation task registration workflow
- Improved benchmark evaluation output formatting
- Restructured evals folder organization for better modularity
### Removed
- `llama stack configure` command

View file

@ -34,22 +34,22 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on
### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily,
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:|
| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| SambaNova | Hosted | | :heavy_check_mark: | | | |
| Cerebras | Hosted | | :heavy_check_mark: | | | |
| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | |
| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | |
| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | |
| Groq | Hosted | | :heavy_check_mark: | | | |
| Ollama | Single Node | | :heavy_check_mark: | | | |
| TGI | Hosted and Single Node | | :heavy_check_mark: | | | |
| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | |
| Chroma | Single Node | | | :heavy_check_mark: | | |
| PG Vector | Single Node | | | :heavy_check_mark: | | |
| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | |
| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | |
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ |
| SambaNova | Hosted | | ✅ | | | |
| Cerebras | Hosted | | ✅ | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | |
| Together | Hosted | ✅ | ✅ | | ✅ | |
| Groq | Hosted | | ✅ | | | |
| Ollama | Single Node | | ✅ | | | |
| TGI | Hosted and Single Node | | ✅ | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | |
| Chroma | Single Node | | | ✅ | | |
| PG Vector | Single Node | | | ✅ | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | |
| vLLM | Hosted and Single Node | | ✅ | | | |
### Distributions

View file

@ -69,6 +69,40 @@
"fiddlecube": [
"httpx"
],
"dell": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"autoevals",
@ -255,6 +289,38 @@
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [
"aiohttp",
"aiosqlite",
@ -322,6 +388,36 @@
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
@ -424,101 +520,5 @@
"vllm",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"dell": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
]
}

docs/conftest.py (new file, +9 lines)
View file

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
def pytest_collection_modifyitems(items):
for item in items:
item.name = item.name.replace(' ', '_')

View file

@ -86,7 +86,6 @@
"# NBVAL_SKIP\n",
"\n",
"!apt-get install -y bubblewrap\n",
"# install a branch of llama stack\n",
"import os\n",
"os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n",
"!pip install uv\n",
@ -3397,6 +3396,231 @@
"response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n",
"pprint(response)\n"
]
},
{
"cell_type": "markdown",
"id": "ad077440",
"metadata": {},
"source": [
"## 4. Image Understanding with Llama 3.2\n",
"\n",
"Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image."
]
},
{
"cell_type": "markdown",
"id": "82e381ec",
"metadata": {},
"source": [
"### 4.1 Setup and helpers\n",
"\n",
"Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "865fc5a8",
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-stack-client==0.1.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44e05e16",
"metadata": {},
"outputs": [],
"source": [
"!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "469750f7",
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def display_image(path):\n",
" img = Image.open(path)\n",
" plt.imshow(img)\n",
" plt.axis('off')\n",
" plt.show()\n",
"\n",
"display_image(\"Llama_Repo.jpeg\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2c1e1c2",
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"\n",
"def encode_image(image_path):\n",
" with open(image_path, \"rb\") as image_file:\n",
" base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
" base64_url = f\"data:image/png;base64,{base64_string}\"\n",
" return base64_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c565f99e",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client import LlamaStackClient\n",
"\n",
"LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n",
"LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\""
]
},
{
"cell_type": "markdown",
"id": "7737cd41",
"metadata": {},
"source": [
"### 4.2 Using Llama Stack Chat API\n",
"\n",
"The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7914894",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.inference.event_logger import EventLogger\n",
"\n",
"async def run_main(image_path: str, prompt):\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" message = {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }\n",
"\n",
" response = client.inference.chat_completion(\n",
" messages=[message],\n",
" model_id=LLAMA32_11B_INSTRUCT,\n",
" stream=False,\n",
" )\n",
"\n",
" print(response.completion_message.content.lower().strip())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ee09b97",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
},
{
"cell_type": "markdown",
"id": "e741d7b9",
"metadata": {},
"source": [
"### 4.3 Using Llama Stack Agent API\n",
"\n",
"The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9a83275",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
"\n",
"async def run_main(image_path, prompt):\n",
" base64_image = encode_image(image_path)\n",
"\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" agent_config = AgentConfig(\n",
" model=LLAMA32_11B_INSTRUCT,\n",
" instructions=\"You are a helpful assistant\",\n",
" enable_session_persistence=False,\n",
" )\n",
"\n",
" agent = Agent(client, agent_config)\n",
" session_id = agent.create_session(\"test-session\")\n",
"\n",
" response = agent.create_turn(\n",
" messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }],\n",
" session_id=session_id,\n",
" )\n",
"\n",
" for log in EventLogger().log(response):\n",
" log.print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15d0098b",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
}
],
"metadata": {

View file

@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI applications.
The best way to get started is to look at this notebook, which walks through the various APIs (from basic inference to RAG agents) and how to use them.
**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)
**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
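For a quick feel of the client API before diving into the notebook, here is a minimal sketch (assuming a stack server running locally on the default port 8321, the `llama-stack-client` package installed, and an 8B Instruct model registered):

```python
from llama_stack_client import LlamaStackClient

# Assumes a Llama Stack server is already running on the default port.
client = LlamaStackClient(base_url="http://localhost:8321")

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",  # any registered model id works
    messages=[{"role": "user", "content": "What can I build with Llama Stack?"}],
    stream=False,
)
print(response.completion_message.content)
```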
Here are some key topics that will help you build effective agents:

View file

@ -36,13 +36,12 @@ chunks = [
"content": "Your document text here",
"mime_type": "text/plain",
},
...,
]
client.vector_io.insert(vector_db_id, chunks)
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
# You can then query for these chunks
chunks_response = client.vector_io.query(
vector_db_id, query="What do you know about..."
vector_db_id=vector_db_id, query="What do you know about..."
)
```
@ -72,8 +71,8 @@ client.tool_runtime.rag_tool.insert(
# Query documents
results = client.tool_runtime.rag_tool.query(
vector_db_id=vector_db_id,
query="What do you know about...",
vector_db_ids=[vector_db_id],
content="What do you know about...",
)
```
@ -82,10 +81,14 @@ results = client.tool_runtime.rag_tool.query(
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
```python
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.lib.agents.agent import Agent
# Configure agent with memory
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a helpful assistant",
enable_session_persistence=False,
toolgroups=[
{
"name": "builtin::rag",
@ -105,10 +108,10 @@ response = agent.create_turn(
{"role": "user", "content": "I am providing some documents for reference."}
],
documents=[
dict(
content="https://raw.githubusercontent.com/example/doc.rst",
mime_type="text/plain",
)
{
"content": "https://raw.githubusercontent.com/example/doc.rst",
"mime_type": "text/plain",
}
],
session_id=session_id,
)

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Dell Distribution of Llama Stack

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Fireworks Distribution
```{toctree}

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference Distribution
```{toctree}

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference Quantized Distribution
```{toctree}

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Ollama Distribution
```{toctree}

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Remote vLLM Distribution
```{toctree}
:maxdepth: 2

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# SambaNova Distribution
```{toctree}

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# TGI Distribution

View file

@ -1,7 +1,7 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Together Distribution
```{toctree}

View file

@ -2,7 +2,7 @@
```{admonition} News
:class: tip
Llama Stack 0.1.1 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.1) for more details.
Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details.
```
# Llama Stack

View file

@ -22,9 +22,9 @@ class StackListProviders(Subcommand):
self.parser.set_defaults(func=self._run_providers_list_cmd)
def _add_arguments(self):
from llama_stack.distribution.datatypes import Api
from llama_stack.distribution.distribution import providable_apis
api_values = [a.value for a in Api]
api_values = [api.value for api in providable_apis()]
self.parser.add_argument(
"api",
type=str,

View file

@ -55,6 +55,16 @@ class StackRun(Subcommand):
default=[],
metavar="KEY=VALUE",
)
self.parser.add_argument(
"--tls-keyfile",
type=str,
help="Path to TLS key file for HTTPS",
)
self.parser.add_argument(
"--tls-certfile",
type=str,
help="Path to TLS certificate file for HTTPS",
)
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import importlib.resources
@ -178,4 +188,7 @@ class StackRun(Subcommand):
return
run_args.extend(["--env", f"{key}={value}"])
if args.tls_keyfile and args.tls_certfile:
run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
run_with_pty(run_args)

View file

@ -117,6 +117,23 @@ class Provider(BaseModel):
config: Dict[str, Any]
class ServerConfig(BaseModel):
port: int = Field(
default=8321,
description="Port to listen on",
ge=1024,
le=65535,
)
tls_certfile: Optional[str] = Field(
default=None,
description="Path to TLS certificate file for HTTPS",
)
tls_keyfile: Optional[str] = Field(
default=None,
description="Path to TLS key file for HTTPS",
)
class StackRunConfig(BaseModel):
version: str = LLAMA_STACK_RUN_CONFIG_VERSION
@ -159,6 +176,11 @@ a default SQLite store will be used.""",
eval_tasks: List[EvalTaskInput] = Field(default_factory=list)
tool_groups: List[ToolGroupInput] = Field(default_factory=list)
server: ServerConfig = Field(
default_factory=ServerConfig,
description="Configuration for the HTTP(S) server",
)
class BuildConfig(BaseModel):
version: str = LLAMA_STACK_BUILD_CONFIG_VERSION
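As a standalone sanity check, here is a sketch that mirrors the fields above with pydantic v2 (for illustration only, not the actual stack code) and shows how the new `server` block resolves:

```python
from typing import Optional
from pydantic import BaseModel, Field

# Mirror of the ServerConfig fields shown above, for illustration only.
class ServerConfig(BaseModel):
    port: int = Field(default=8321, ge=1024, le=65535)
    tls_certfile: Optional[str] = None
    tls_keyfile: Optional[str] = None

# Omitting the block entirely falls back to plain HTTP on port 8321.
print(ServerConfig())
# Supplying both TLS paths is what later lets the server enable HTTPS.
print(ServerConfig.model_validate({"port": 8443, "tls_certfile": "cert.pem", "tls_keyfile": "key.pem"}))
```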

View file

@ -17,17 +17,6 @@ from typing import Any, get_args, get_origin, Optional, TypeVar
import httpx
import yaml
from llama_stack_client import (
APIResponse,
AsyncAPIResponse,
AsyncLlamaStackClient,
AsyncStream,
LlamaStackClient,
NOT_GIVEN,
)
from pydantic import BaseModel, TypeAdapter
from rich.console import Console
from termcolor import cprint
from llama_stack.distribution.build import print_pip_install_help
from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
@ -46,6 +35,17 @@ from llama_stack.providers.utils.telemetry.tracing import (
setup_logger,
start_trace,
)
from llama_stack_client import (
APIResponse,
AsyncAPIResponse,
AsyncLlamaStackClient,
AsyncStream,
LlamaStackClient,
NOT_GIVEN,
)
from pydantic import BaseModel, TypeAdapter
from rich.console import Console
from termcolor import cprint
T = TypeVar("T")
@ -198,6 +198,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
async def initialize(self) -> bool:
try:
self.endpoint_impls = None
self.impls = await construct_stack(self.config, self.custom_provider_registry)
except ModuleNotFoundError as _e:
cprint(_e.msg, "red")
@ -213,7 +214,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n",
"yellow",
)
return False
raise _e
if Api.telemetry in self.impls:
setup_logger(self.impls[Api.telemetry])
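Because `return False` is now `raise _e`, callers see the underlying failure instead of a silent boolean; a minimal caller-side sketch (hypothetical, assuming the class is importable from `llama_stack.distribution.library_client`):

```python
import asyncio

# Hypothetical import path, shown for illustration only.
from llama_stack.distribution.library_client import AsyncLlamaStackAsLibraryClient


async def main():
    client = AsyncLlamaStackAsLibraryClient("ollama")  # template name, as in the hint above
    try:
        await client.initialize()
    except ModuleNotFoundError:
        # Provider dependencies are missing; follow the printed
        # `llama stack build --template ollama --image-type venv` hint and retry.
        raise


asyncio.run(main())
```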

View file

@ -282,8 +282,19 @@ def main():
action="append",
help="Environment variables in KEY=value format. Can be specified multiple times.",
)
parser.add_argument(
"--tls-keyfile",
help="Path to TLS key file for HTTPS",
required="--tls-certfile" in sys.argv,
)
parser.add_argument(
"--tls-certfile",
help="Path to TLS certificate file for HTTPS",
required="--tls-keyfile" in sys.argv,
)
args = parser.parse_args()
if args.env:
for env_pair in args.env:
try:
@ -381,11 +392,36 @@ def main():
import uvicorn
# FYI this does not do hot-reloads
# Configure SSL if certificates are provided
port = args.port or config.server.port
ssl_config = None
if args.tls_keyfile:
keyfile = args.tls_keyfile
certfile = args.tls_certfile
else:
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
ssl_config = {
"ssl_keyfile": keyfile,
"ssl_certfile": certfile,
}
print(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0"
print(f"Listening on {listen_host}:{args.port}")
uvicorn.run(app, host=listen_host, port=args.port)
print(f"Listening on {listen_host}:{port}")
uvicorn_config = {
"app": app,
"host": listen_host,
"port": port,
}
if ssl_config:
uvicorn_config.update(ssl_config)
uvicorn.run(**uvicorn_config)
def extract_path_params(route: str) -> List[str]:

View file

@ -34,6 +34,7 @@ shift
# Process environment variables from --env arguments
env_vars=""
other_args=""
while [[ $# -gt 0 ]]; do
case "$1" in
--env)
@ -48,6 +49,7 @@ while [[ $# -gt 0 ]]; do
fi
;;
*)
other_args="$other_args $1"
shift
;;
esac
@ -61,4 +63,5 @@ $CONDA_PREFIX/bin/python \
-m llama_stack.distribution.server.server \
--yaml-config "$yaml_config" \
--port "$port" \
$env_vars
$env_vars \
$other_args

View file

@ -40,8 +40,12 @@ shift
port="$1"
shift
# Initialize other_args
other_args=""
# Process environment variables from --env arguments
env_vars=""
while [[ $# -gt 0 ]]; do
case "$1" in
--env)
@ -55,6 +59,7 @@ while [[ $# -gt 0 ]]; do
fi
;;
*)
other_args="$other_args $1"
shift
;;
esac
@ -93,5 +98,8 @@ $CONTAINER_BINARY run $CONTAINER_OPTS -it \
-v "$yaml_config:/app/config.yaml" \
$mounts \
--env LLAMA_STACK_PORT=$port \
--entrypoint='["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]' \
$container_image:$version_tag
--entrypoint python \
$container_image:$version_tag \
-m llama_stack.distribution.server.server \
--yaml-config /app/config.yaml \
$other_args

View file

@ -67,7 +67,6 @@ def generate_bwrap_command(bind_dirs: List[str]) -> str:
@dataclass
class CodeExecutionContext:
matplotlib_dump_dir: str
use_proxy: bool = False
@dataclass

View file

@ -26,6 +26,7 @@ from llama_stack.apis.inference import (
Message,
ResponseFormat,
ToolChoice,
ToolConfig,
)
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.providers.remote.inference.groq.config import GroqConfig

View file

@ -352,24 +352,20 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
return EmbeddingsResponse(embeddings=embeddings)
async def register_model(self, model: Model) -> Model:
# ollama does not have embedding models running. Check if the model is in list of available models.
if model.model_type == ModelType.embedding:
response = await self.client.list()
async def check_model_availability(model_id: str):
response = await self.client.ps()
available_models = [m["model"] for m in response["models"]]
if model.provider_resource_id not in available_models:
if model_id not in available_models:
raise ValueError(
f"Model '{model.provider_resource_id}' is not available in Ollama. "
f"Available models: {', '.join(available_models)}"
f"Model '{model_id}' is not available in Ollama. Available models: {', '.join(available_models)}"
)
if model.model_type == ModelType.embedding:
await check_model_availability(model.provider_resource_id)
return model
model = await self.register_helper.register_model(model)
models = await self.client.ps()
available_models = [m["model"] for m in models["models"]]
if model.provider_resource_id not in available_models:
raise ValueError(
f"Model '{model.provider_resource_id}' is not available in Ollama. "
f"Available models: {', '.join(available_models)}"
)
await check_model_availability(model.provider_resource_id)
return model

View file

@ -12,8 +12,8 @@ from .config import QdrantConfig
async def get_adapter_impl(config: QdrantConfig, deps: Dict[Api, ProviderSpec]):
from .qdrant import QdrantVectorMemoryAdapter
from .qdrant import QdrantVectorDBAdapter
impl = QdrantVectorMemoryAdapter(config, deps[Api.inference])
impl = QdrantVectorDBAdapter(config, deps[Api.inference])
await impl.initialize()
return impl

View file

@ -55,7 +55,7 @@ class QdrantIndex(EmbeddingIndex):
points = []
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
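# the source document id now lives in the chunk's metadata rather than as a top-level attribute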
chunk_id = f"{chunk.document_id}:chunk-{i}"
chunk_id = f"{chunk.metadata['document_id']}:chunk-{i}"
points.append(
PointStruct(
id=convert_id(chunk_id),
@ -93,6 +93,9 @@ class QdrantIndex(EmbeddingIndex):
return QueryChunksResponse(chunks=chunks, scores=scores)
async def delete(self):
await self.client.delete_collection(collection_name=self.collection_name)
class QdrantVectorDBAdapter(VectorIO, VectorDBsProtocolPrivate):
def __init__(self, config: QdrantConfig, inference_api: Api.inference) -> None:

View file

@ -95,7 +95,7 @@ class TestDatasetIO:
assert len(response) == 1
assert response[0].identifier == "test_dataset"
with pytest.raises(Exception) as exc_info:
with pytest.raises(ValueError):
# unregister a dataset that does not exist
await datasets_impl.unregister_dataset("test_dataset2")
@ -104,7 +104,7 @@ class TestDatasetIO:
assert isinstance(response, list)
assert len(response) == 0
with pytest.raises(Exception) as exc_info:
with pytest.raises(ValueError):
await datasets_impl.unregister_dataset("test_dataset")
@pytest.mark.asyncio

View file

@ -32,7 +32,7 @@ class TestModelRegistration:
)
# Try to register a model that's too large for local inference
with pytest.raises(ValueError) as exc_info:
with pytest.raises(ValueError):
await models_impl.register_model(
model_id="Llama3.1-70B-Instruct",
)
@ -42,7 +42,7 @@ class TestModelRegistration:
_, models_impl = inference_stack
# Try to register a non-existent model
with pytest.raises(Exception) as exc_info:
with pytest.raises(ValueError):
await models_impl.register_model(
model_id="Llama3-NonExistent-Model",
)
@ -59,7 +59,7 @@ class TestModelRegistration:
},
)
with pytest.raises(ValueError) as exc_info:
with pytest.raises(ValueError):
await models_impl.register_model(
model_id="custom-model-2",
metadata={
@ -88,7 +88,7 @@ class TestModelRegistration:
async def test_register_with_invalid_llama_model(self, inference_stack):
_, models_impl = inference_stack
with pytest.raises(ValueError) as exc_info:
with pytest.raises(ValueError):
await models_impl.register_model(
model_id="custom-model-2",
metadata={"llama_model": "invalid-llama-model"},

View file

@ -4,12 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
from pathlib import Path
import pytest
from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem, URL
from llama_stack.apis.common.content_types import URL, ImageContentItem, TextContentItem
from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseEventType,
@ -23,7 +23,7 @@ from .utils import group_chunks
THIS_DIR = Path(__file__).parent
with open(THIS_DIR / "pasta.jpeg", "rb") as f:
PASTA_IMAGE = f.read()
PASTA_IMAGE = base64.b64encode(f.read()).decode("utf-8")
class TestVisionModelInference:

View file

@ -29,7 +29,7 @@ def find_template_dirs(templates_dir: Path) -> Iterator[Path]:
if not templates_dir.exists():
raise FileNotFoundError(f"Templates directory not found: {templates_dir}")
return (d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__")
return sorted(d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__")
def process_template(template_dir: Path, progress) -> None:

View file

@ -115,3 +115,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -117,3 +117,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -116,3 +116,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -107,3 +107,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -172,3 +172,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -161,3 +161,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -124,3 +124,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -114,3 +114,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -124,3 +124,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -114,3 +114,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -126,3 +126,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -115,3 +115,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -117,3 +117,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -147,3 +147,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -16,7 +16,7 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still use this distribution since Ollama supports GPU acceleration.
{%- if run_config_env_vars %}
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:

View file

@ -121,3 +121,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -110,3 +110,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -126,3 +126,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -115,3 +115,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -126,3 +126,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -131,8 +131,15 @@ class DistributionTemplate(BaseModel):
providers_str = ", ".join(f"`{p}`" for p in providers)
providers_table += f"| {api} | {providers_str} |\n"
template = "<!-- This file was auto-generated by distro_codegen.py, please edit source -->\n"
template += self.template_path.read_text()
template = self.template_path.read_text()
comment = "<!-- This file was auto-generated by distro_codegen.py, please edit source -->\n"
orphantext = "---\norphan: true\n---\n"
if template.startswith(orphantext):
template = template.replace(orphantext, orphantext + comment)
else:
template = comment + template
# Render template with rich-generated table
env = jinja2.Environment(
trim_blocks=True,

View file

@ -114,3 +114,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -113,3 +113,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -167,3 +167,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -156,3 +156,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -117,3 +117,5 @@ tool_groups:
provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
provider_id: code-interpreter
server:
port: 8321

View file

@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "llama_stack"
version = "0.1.1"
version = "0.1.2"
authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
description = "Llama Stack"
readme = "README.md"
@ -25,8 +25,8 @@ dependencies = [
"fire",
"httpx",
"huggingface-hub",
"llama-models>=0.1.1",
"llama-stack-client>=0.1.1",
"llama-models>=0.1.2",
"llama-stack-client>=0.1.2",
"prompt-toolkit",
"python-dotenv",
"pydantic>=2",

View file

@ -4,6 +4,7 @@ annotated-types==0.7.0
anyio==4.8.0
blobfile==3.0.0
certifi==2025.1.31
chardet==5.2.0
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6 ; sys_platform == 'win32'
@ -18,8 +19,8 @@ httpx==0.28.1
huggingface-hub==0.28.1
idna==3.10
jinja2==3.1.5
llama-models==0.1.1
llama-stack-client==0.1.1
llama-models==0.1.2
llama-stack-client==0.1.2
lxml==5.3.0
markdown-it-py==3.0.0
markupsafe==3.0.2
@ -34,6 +35,7 @@ pycryptodomex==3.21.0
pydantic==2.10.6
pydantic-core==2.27.2
pygments==2.19.1
pypdf==5.2.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2025.1

View file

@ -4,18 +4,18 @@ You can run llama stack integration tests on either a Llama Stack Library or a Llama Stack endpoint.
To test on a Llama Stack library with a certain configuration, run
```bash
LLAMA_STACK_CONFIG=./llama_stack/templates/cerebras/run.yaml
pytest -s -v tests/client-sdk/inference/test_inference.py
pytest -s -v tests/client-sdk/inference/
```
or just the template name
```bash
LLAMA_STACK_CONFIG=together
pytest -s -v tests/client-sdk/inference/test_inference.py
pytest -s -v tests/client-sdk/inference/
```
To test on a Llama Stack endpoint, run
```bash
LLAMA_STACK_BASE_URL=http://localhost:8089
pytest -s -v tests/client-sdk/inference/test_inference.py
pytest -s -v tests/client-sdk/inference
```
## Report Generation

View file

@ -263,12 +263,14 @@ def test_custom_tool(llama_stack_client, agent_config):
assert "CustomTool" in logs_str
def test_override_system_message_behavior(llama_stack_client, agent_config):
# TODO: fix this flaky test
def xtest_override_system_message_behavior(llama_stack_client, agent_config):
client_tool = TestClientTool()
agent_config = {
**agent_config,
"instructions": "You are a pirate",
"client_tools": [client_tool.get_tool_definition()],
"model": "meta-llama/Llama-3.2-3B-Instruct",
}
agent = Agent(llama_stack_client, agent_config, client_tools=(client_tool,))

View file

@ -4,9 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import pathlib
import pytest
from pydantic import BaseModel
@ -14,6 +11,7 @@ PROVIDER_TOOL_PROMPT_FORMAT = {
"remote::ollama": "json",
"remote::together": "json",
"remote::fireworks": "json",
"remote::vllm": "json",
}
PROVIDER_LOGPROBS_TOP_K = set(
@ -56,23 +54,6 @@ def get_weather_tool_definition():
}
@pytest.fixture
def image_path():
return pathlib.Path(__file__).parent / "dog.png"
@pytest.fixture
def base64_image_data(image_path):
# Convert the image to base64
return base64.b64encode(image_path.read_bytes()).decode("utf-8")
@pytest.fixture
def base64_image_url(base64_image_data, image_path):
# suffix includes the ., so we remove it
return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}"
def test_text_completion_non_streaming(llama_stack_client, text_model_id):
response = llama_stack_client.inference.completion(
content="Complete the sentence using one word: Roses are red, violets are ",
@ -176,8 +157,8 @@ def test_text_completion_structured_output(llama_stack_client, text_model_id, in
@pytest.mark.parametrize(
"question,expected",
[
("What are the names of planets in our solar system?", "Earth"),
("What are the names of the planets that have rings around them?", "Saturn"),
("Which planet do humans live on?", "Earth"),
("Which planet has rings around it with a name starting with letter S?", "Saturn"),
],
)
def test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, question, expected):
@ -299,101 +280,3 @@ def test_text_chat_completion_structured_output(llama_stack_client, text_model_i
assert answer.last_name == "Jordan"
assert answer.year_of_birth == 1963
assert answer.num_seasons_in_nba == 15
def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
# TODO: Replace with Github based URI to resources/sample1.jpg
"uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
},
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
def test_image_chat_completion_streaming(llama_stack_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
# TODO: Replace with Github based URI to resources/sample1.jpg
"uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
},
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=True,
)
streamed_content = ""
for chunk in response:
streamed_content += chunk.event.delta.text.lower()
assert len(streamed_content) > 0
assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})
@pytest.mark.parametrize("type_", ["url", "data"])
def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_):
image_spec = {
"url": {
"type": "image",
"image": {
"url": {
"uri": base64_image_url,
},
},
},
"data": {
"type": "image",
"image": {
"data": base64_image_data,
},
},
}[type_]
message = {
"role": "user",
"content": [
image_spec,
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0

View file

@ -0,0 +1,133 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import pathlib
import pytest
@pytest.fixture(scope="session")
def inference_provider_type(llama_stack_client):
providers = llama_stack_client.providers.list()
inference_providers = [p for p in providers if p.api == "inference"]
assert len(inference_providers) > 0, "No inference providers found"
return inference_providers[0].provider_type
@pytest.fixture
def image_path():
return pathlib.Path(__file__).parent / "dog.png"
@pytest.fixture
def base64_image_data(image_path):
# Convert the image to base64
return base64.b64encode(image_path.read_bytes()).decode("utf-8")
@pytest.fixture
def base64_image_url(base64_image_data, image_path):
# suffix includes the ., so we remove it
return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}"
def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
# TODO: Replace with Github based URI to resources/sample1.jpg
"uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
},
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
def test_image_chat_completion_streaming(llama_stack_client, vision_model_id):
message = {
"role": "user",
"content": [
{
"type": "image",
"image": {
"url": {
# TODO: Replace with Github based URI to resources/sample1.jpg
"uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg"
},
},
},
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=True,
)
streamed_content = ""
for chunk in response:
streamed_content += chunk.event.delta.text.lower()
assert len(streamed_content) > 0
assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})
@pytest.mark.parametrize("type_", ["url", "data"])
def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_):
image_spec = {
"url": {
"type": "image",
"image": {
"url": {
"uri": base64_image_url,
},
},
},
"data": {
"type": "image",
"image": {
"data": base64_image_data,
},
},
}[type_]
message = {
"role": "user",
"content": [
image_spec,
{
"type": "text",
"text": "Describe what is in this image.",
},
],
}
response = llama_stack_client.inference.chat_completion(
model_id=vision_model_id,
messages=[message],
stream=False,
)
message_content = response.completion_message.content.lower().strip()
assert len(message_content) > 0

uv.lock (generated, 18 changed lines)
View file

@ -687,7 +687,7 @@ wheels = [
[[package]]
name = "llama-models"
version = "0.1.1"
version = "0.1.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jinja2" },
@ -696,14 +696,14 @@ dependencies = [
{ name = "pyyaml" },
{ name = "tiktoken" },
]
sdist = { url = "https://files.pythonhosted.org/packages/df/80/4a4595cf5e55f71c0e15b85ff2f4c04b0742bf664ede062a09c9d383bf7b/llama_models-0.1.1.tar.gz", hash = "sha256:7cb5a9fe38485b47aff4c93e183d6d390a676a7619f3355502576b652f17733a", size = 1608412 }
sdist = { url = "https://files.pythonhosted.org/packages/b5/f2/ed8310d4677cd38ab45ffba45aea2a4e9882b640045ad9c3198ac69e5a85/llama_models-0.1.2.tar.gz", hash = "sha256:1266eaec7a8db336e4ed034d2b494189ccb7fd6d6b7aefe874eee749a4340b9b", size = 1608069 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/93/d49dd0f0cd37df1a7a7fb25444d010f626cdf42b21eea11d839b0f6a808a/llama_models-0.1.1-py3-none-any.whl", hash = "sha256:7e4f15dc4f6f011852ea2c42f9770b75140f5eca670b32cc67fc0a4605c55f89", size = 1650981 },
{ url = "https://files.pythonhosted.org/packages/55/a7/34b9e88ef4109759c8881f43b8006139e3d13d54c440b8c571b253655f54/llama_models-0.1.2-py3-none-any.whl", hash = "sha256:8aa5287d1c6325698991ff677e71148cac347e07493bb5b3ab891e614b89e1f8", size = 1651273 },
]
[[package]]
name = "llama-stack"
version = "0.1.1"
version = "0.1.2"
source = { editable = "." }
dependencies = [
{ name = "blobfile" },
@ -751,8 +751,8 @@ requires-dist = [
{ name = "fire" },
{ name = "httpx" },
{ name = "huggingface-hub" },
{ name = "llama-models", specifier = ">=0.1.1" },
{ name = "llama-stack-client", specifier = ">=0.1.1" },
{ name = "llama-models", specifier = ">=0.1.2" },
{ name = "llama-stack-client", specifier = ">=0.1.2" },
{ name = "myst-parser", marker = "extra == 'docs'" },
{ name = "nbval", marker = "extra == 'dev'" },
{ name = "pre-commit", marker = "extra == 'dev'" },
@ -780,7 +780,7 @@ requires-dist = [
[[package]]
name = "llama-stack-client"
version = "0.1.1"
version = "0.1.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
@ -797,9 +797,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/07/42/7004958ac1a6da9a8060decf0d9120fdeb3b2775de090a0a473f2ee4a27d/llama_stack_client-0.1.1.tar.gz", hash = "sha256:3e549a848ade959d342fa52ec49b1913b7bb615a77b5b8dcaefe6ff94409049e", size = 179729 }
sdist = { url = "https://files.pythonhosted.org/packages/9e/75/8b41a3026c871a8650cd8d2cfda9f891a9163458813574f36518bb40afe4/llama_stack_client-0.1.2.tar.gz", hash = "sha256:94277ddae52be557d771dcdc15d85af9012b5aa87439dd69ec1dc0ff486b0c8e", size = 188023 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/80/66/5255c09dc001ff437fd6fe6fad27142035b60073df243f7df0494095f605/llama_stack_client-0.1.1-py3-none-any.whl", hash = "sha256:e07d58fdcc1eaa370dd00b94c2dd1a8169c0ac60c37f6f2772cbc2c5b63f2e62", size = 348665 },
{ url = "https://files.pythonhosted.org/packages/c4/32/3a3a97eecff1f1e3a1dc90e9b00681abea11ec4f43a7ca549981261e18b6/llama_stack_client-0.1.2-py3-none-any.whl", hash = "sha256:85ff0fb57a62d7d0470cfaa2b07a595c9fb3483297944d5e5a066db850d38ccd", size = 359415 },
]
[[package]]