Merge branch 'main' into alexey-doc-links-update

commit 3bccc57f34
46 changed files with 13855 additions and 472 deletions
68  docs/_static/llama-stack-spec.html (vendored)
@@ -1380,6 +1380,40 @@
                         }
                     }
                 ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "Unregister a benchmark.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to unregister.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
             }
         },
         "/v1/openai/v1/chat/completions/{completion_id}": {
@@ -1620,6 +1654,40 @@
                         }
                     }
                 ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "ScoringFunctions"
+                ],
+                "description": "Unregister a scoring function.",
+                "parameters": [
+                    {
+                        "name": "scoring_fn_id",
+                        "in": "path",
+                        "description": "The ID of the scoring function to unregister.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
             }
         },
         "/v1/shields/{identifier}": {
49  docs/_static/llama-stack-spec.yaml (vendored)
@@ -954,6 +954,30 @@ paths:
           required: true
           schema:
             type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Benchmarks
+      description: Unregister a benchmark.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: The ID of the benchmark to unregister.
+          required: true
+          schema:
+            type: string
   /v1/openai/v1/chat/completions/{completion_id}:
     get:
       responses:
@@ -1119,6 +1143,31 @@ paths:
           required: true
           schema:
             type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - ScoringFunctions
+      description: Unregister a scoring function.
+      parameters:
+        - name: scoring_fn_id
+          in: path
+          description: >-
+            The ID of the scoring function to unregister.
+          required: true
+          schema:
+            type: string
   /v1/shields/{identifier}:
     get:
       responses:
701  docs/notebooks/langchain/Llama_Stack_LangChain.ipynb (new file)
@@ -0,0 +1,701 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1ztegmwm4sp",
   "metadata": {},
   "source": [
    "## LlamaStack + LangChain Integration Tutorial\n",
    "\n",
    "This notebook demonstrates how to integrate **LlamaStack** with **LangChain** to build a complete RAG (Retrieval-Augmented Generation) system.\n",
    "\n",
    "### Overview\n",
    "\n",
    "- **LlamaStack**: Provides the infrastructure for running LLMs and OpenAI-compatible vector stores\n",
    "- **LangChain**: Provides the framework for chaining operations and prompt templates\n",
    "- **Integration**: Uses LlamaStack's OpenAI-compatible API with LangChain\n",
    "\n",
    "### What You'll See\n",
    "\n",
    "1. Setting up the LlamaStack server with the Fireworks AI provider\n",
    "2. Creating and querying vector stores\n",
    "3. Building RAG chains with LangChain + LlamaStack\n",
    "4. Querying the chain for relevant information\n",
    "\n",
    "### Prerequisites\n",
    "\n",
    "- Fireworks API key\n",
    "\n",
    "---\n",
    "\n",
    "### 1. Installation and Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ktr5ls2cas",
   "metadata": {},
   "source": [
    "#### Install Required Dependencies\n",
    "\n",
    "First, we install all the necessary packages for LangChain and FastAPI integration."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5b6a6a17-b931-4bea-8273-0d6e5563637a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: uv in /Users/swapna942/miniconda3/lib/python3.12/site-packages (0.7.20)\n",
      "\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n",
      "\u001b[2mAudited \u001b[1m7 packages\u001b[0m \u001b[2min 42ms\u001b[0m\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install uv\n",
    "!uv pip install fastapi uvicorn \"langchain>=0.2\" langchain-openai \\\n",
    "    langchain-community langchain-text-splitters \\\n",
    "    faiss-cpu"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "wmt9jvqzh7n",
   "metadata": {},
   "source": [
    "### 2. LlamaStack Server Setup\n",
    "\n",
    "#### Build and Start LlamaStack Server\n",
    "\n",
    "This section sets up the LlamaStack server with:\n",
    "- **Fireworks AI** as the inference provider\n",
    "- **Sentence Transformers** for embeddings\n",
    "\n",
    "The server runs on `localhost:8321` and provides OpenAI-compatible endpoints."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dd2dacf3-ec8b-4cc7-8ff4-b5b6ea4a6e9e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "import time\n",
    "\n",
    "# Remove UV_SYSTEM_PYTHON to ensure uv creates a proper virtual environment\n",
    "# instead of trying to use system Python globally, which could cause permission issues\n",
    "# and package conflicts with the system's Python installation\n",
    "if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
    "    del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
    "\n",
    "def run_llama_stack_server_background():\n",
    "    \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
    "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
    "    process = subprocess.Popen(\n",
    "        \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
    "        shell=True,\n",
    "        stdout=log_file,\n",
    "        stderr=log_file,\n",
    "        text=True,\n",
    "    )\n",
    "\n",
    "    print(f\"Building and starting Llama Stack server with PID: {process.pid}\")\n",
    "    return process\n",
    "\n",
    "\n",
    "def wait_for_server_to_start():\n",
    "    import requests\n",
    "    from requests.exceptions import ConnectionError\n",
    "\n",
    "    url = \"http://0.0.0.0:8321/v1/health\"\n",
    "    max_retries = 30\n",
    "    retry_interval = 1\n",
    "\n",
    "    print(\"Waiting for server to start\", end=\"\")\n",
    "    for _ in range(max_retries):\n",
    "        try:\n",
    "            response = requests.get(url)\n",
    "            if response.status_code == 200:\n",
    "                print(\"\\nServer is ready!\")\n",
    "                return True\n",
    "        except ConnectionError:\n",
    "            print(\".\", end=\"\", flush=True)\n",
    "        time.sleep(retry_interval)\n",
    "\n",
    "    print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
    "    return False\n",
    "\n",
    "\n",
    "def kill_llama_stack_server():\n",
    "    # Kill any existing llama stack server processes using pkill command\n",
    "    os.system(\"pkill -f llama_stack.core.server.server\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "28bd8dbd-4576-4e76-813f-21ab94db44a2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building and starting Llama Stack server with PID: 19747\n",
      "Waiting for server to start....\n",
      "Server is ready!\n"
     ]
    }
   ],
   "source": [
    "server_process = run_llama_stack_server_background()\n",
    "assert wait_for_server_to_start()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "gr9cdcg4r7n",
   "metadata": {},
   "source": [
    "#### Install LlamaStack Client\n",
    "\n",
    "Install the client library to interact with the LlamaStack server."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "487d2dbc-d071-400e-b4f0-dcee58f8dc95",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n",
      "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 27ms\u001b[0m\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!uv pip install llama_stack_client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0j5hag7l9x89",
   "metadata": {},
   "source": [
    "### 3. Initialize LlamaStack Client\n",
    "\n",
    "Create a client connection to the LlamaStack server with API keys for different providers:\n",
    "\n",
    "- **Fireworks API Key**: For Fireworks models\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ab4eff97-4565-4c73-b1b3-0020a4c7e2a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_stack_client import LlamaStackClient\n",
    "\n",
    "client = LlamaStackClient(\n",
    "    base_url=\"http://0.0.0.0:8321\",\n",
    "    provider_data={\"fireworks_api_key\": \"***\"},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "vwhexjy1e8o",
   "metadata": {},
   "source": [
    "#### Explore Available Models and Safety Features\n",
    "\n",
    "Check what models and safety shields are available through your LlamaStack instance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "880443ef-ac3c-48b1-a80a-7dab5b25ac61",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/shields \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Available Fireworks models:\n",
      "- fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\n",
      "- fireworks/accounts/fireworks/models/llama-v3p1-70b-instruct\n",
      "- fireworks/accounts/fireworks/models/llama-v3p1-405b-instruct\n",
      "- fireworks/accounts/fireworks/models/llama-v3p2-3b-instruct\n",
      "- fireworks/accounts/fireworks/models/llama-v3p2-11b-vision-instruct\n",
      "- fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct\n",
      "- fireworks/accounts/fireworks/models/llama-v3p3-70b-instruct\n",
      "- fireworks/accounts/fireworks/models/llama4-scout-instruct-basic\n",
      "- fireworks/accounts/fireworks/models/llama4-maverick-instruct-basic\n",
      "- fireworks/nomic-ai/nomic-embed-text-v1.5\n",
      "- fireworks/accounts/fireworks/models/llama-guard-3-8b\n",
      "- fireworks/accounts/fireworks/models/llama-guard-3-11b-vision\n",
      "----\n",
      "Available shields (safety models):\n",
      "code-scanner\n",
      "llama-guard\n",
      "nemo-guardrail\n",
      "----\n"
     ]
    }
   ],
   "source": [
    "print(\"Available Fireworks models:\")\n",
    "for m in client.models.list():\n",
    "    if m.identifier.startswith(\"fireworks/\"):\n",
    "        print(f\"- {m.identifier}\")\n",
    "\n",
    "print(\"----\")\n",
    "print(\"Available shields (safety models):\")\n",
    "for s in client.shields.list():\n",
    "    print(s.identifier)\n",
    "print(\"----\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "gojp7at31ht",
   "metadata": {},
   "source": [
    "### 4. Vector Store Setup\n",
    "\n",
    "#### Create a Vector Store with File Upload\n",
    "\n",
    "Create a vector store using the OpenAI-compatible vector stores API:\n",
    "\n",
    "- **Vector Store**: OpenAI-compatible vector store for document storage\n",
    "- **File Upload**: Automatic chunking and embedding of uploaded files\n",
    "- **Embedding Model**: Sentence Transformers model for text embeddings\n",
    "- **Dimensions**: 384-dimensional embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "be2c2899-ea53-4e5f-b6b8-ed425f5d6572",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File(id='file-54652c95c56c4c34918a97d7ff8a4320', bytes=41, created_at=1757442621, expires_at=1788978621, filename='shipping_policy.txt', object='file', purpose='assistants')\n",
      "File(id='file-fb1227c1d1854da1bd774d21e5b7e41c', bytes=48, created_at=1757442621, expires_at=1788978621, filename='returns_policy.txt', object='file', purpose='assistants')\n",
      "File(id='file-673f874852fe42798675a13d06a256e2', bytes=45, created_at=1757442621, expires_at=1788978621, filename='support.txt', object='file', purpose='assistants')\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores \"HTTP/1.1 200 OK\"\n"
     ]
    }
   ],
   "source": [
    "from io import BytesIO\n",
    "\n",
    "docs = [\n",
    "    (\"Acme ships globally in 3-5 business days.\", {\"title\": \"Shipping Policy\"}),\n",
    "    (\"Returns are accepted within 30 days of purchase.\", {\"title\": \"Returns Policy\"}),\n",
    "    (\"Support is available 24/7 via chat and email.\", {\"title\": \"Support\"}),\n",
    "]\n",
    "\n",
    "file_ids = []\n",
    "for content, metadata in docs:\n",
    "    with BytesIO(content.encode()) as file_buffer:\n",
    "        file_buffer.name = f\"{metadata['title'].replace(' ', '_').lower()}.txt\"\n",
    "        create_file_response = client.files.create(file=file_buffer, purpose=\"assistants\")\n",
    "        print(create_file_response)\n",
    "        file_ids.append(create_file_response.id)\n",
    "\n",
    "# Create vector store with files\n",
    "vector_store = client.vector_stores.create(\n",
    "    name=\"acme_docs\",\n",
    "    file_ids=file_ids,\n",
    "    embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\",\n",
    "    embedding_dimension=384,\n",
    "    provider_id=\"faiss\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9061tmi1zpq",
   "metadata": {},
   "source": [
    "#### Test Vector Store Search\n",
    "\n",
    "Query the vector store. This performs semantic search to find relevant documents based on the query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ba9d1901-bd5e-4216-b3e6-19dc74551cc6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Acme ships globally in 3-5 business days.\n",
      "Returns are accepted within 30 days of purchase.\n"
     ]
    }
   ],
   "source": [
    "search_response = client.vector_stores.search(\n",
    "    vector_store_id=vector_store.id,\n",
    "    query=\"How long does shipping take?\",\n",
    "    max_num_results=2\n",
    ")\n",
    "for result in search_response.data:\n",
    "    content = result.content[0].text\n",
    "    print(content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "usne6mbspms",
   "metadata": {},
   "source": [
    "### 5. LangChain Integration\n",
    "\n",
    "#### Configure LangChain with LlamaStack\n",
    "\n",
    "Set up LangChain to use LlamaStack's OpenAI-compatible API:\n",
    "\n",
    "- **Base URL**: Points to LlamaStack's OpenAI endpoint\n",
    "- **Headers**: Include the Fireworks API key for model access\n",
    "- **Model**: Use the Llama 3.1 8B Instruct model for inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c378bd10-09c2-417c-bdfc-1e0a2dd19084",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "# Point LangChain at the LlamaStack server\n",
    "llm = ChatOpenAI(\n",
    "    base_url=\"http://0.0.0.0:8321/v1/openai/v1\",\n",
    "    api_key=\"dummy\",\n",
    "    model=\"fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\",\n",
    "    default_headers={\"X-LlamaStack-Provider-Data\": '{\"fireworks_api_key\": \"***\"}'},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a4ddpcuk3l",
   "metadata": {},
   "source": [
    "#### Test LLM Connection\n",
    "\n",
    "Verify that LangChain can successfully communicate with the LlamaStack server."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f88ffb5a-657b-4916-9375-c6ddc156c25e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AIMessage(content=\"A llama's gentle eyes shine bright,\\nIn the Andes, it roams through morning light.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': None, 'model_name': 'fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct', 'system_fingerprint': None, 'id': 'chatcmpl-602b5967-82a3-476b-9cd2-7d3b29b76ee8', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--0933c465-ff4d-4a7b-b7fb-fd97dd8244f3-0')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Test the LLM with a simple message\n",
    "messages = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
    "]\n",
    "llm.invoke(messages)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0xh0jg6a0l4a",
   "metadata": {},
   "source": [
    "### 6. Building the RAG Chain\n",
    "\n",
    "#### Create a Complete RAG Pipeline\n",
    "\n",
    "Build a LangChain pipeline that combines:\n",
    "\n",
    "1. **Vector Search**: Query LlamaStack's OpenAI-compatible vector store\n",
    "2. **Context Assembly**: Format retrieved documents\n",
    "3. **Prompt Template**: Structure the input for the LLM\n",
    "4. **LLM Generation**: Generate answers using context\n",
    "5. **Output Parsing**: Extract the final response\n",
    "\n",
    "**Chain Flow**: `Query → Vector Search → Context + Question → LLM → Response`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "9684427d-dcc7-4544-9af5-8b110d014c42",
   "metadata": {},
   "outputs": [],
   "source": [
    "# LangChain for prompt templates and chaining + Llama Stack client for vector search and chat completions\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.prompts import ChatPromptTemplate\n",
    "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n",
    "\n",
    "\n",
    "def join_docs(docs):\n",
    "    return \"\\n\\n\".join([f\"[{d.filename}] {d.content[0].text}\" for d in docs.data])\n",
    "\n",
    "PROMPT = ChatPromptTemplate.from_messages(\n",
    "    [\n",
    "        (\"system\", \"You are a helpful assistant. Use the following context to answer.\"),\n",
    "        (\"user\", \"Question: {question}\\n\\nContext:\\n{context}\"),\n",
    "    ]\n",
    ")\n",
    "\n",
    "vector_step = RunnableLambda(\n",
    "    lambda x: client.vector_stores.search(\n",
    "        vector_store_id=vector_store.id,\n",
    "        query=x,\n",
    "        max_num_results=2\n",
    "    )\n",
    ")\n",
    "\n",
    "chain = (\n",
    "    {\"context\": vector_step | RunnableLambda(join_docs), \"question\": RunnablePassthrough()}\n",
    "    | PROMPT\n",
    "    | llm\n",
    "    | StrOutputParser()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0onu6rhphlra",
   "metadata": {},
   "source": [
    "### 7. Testing the RAG System\n",
    "\n",
    "#### Example 1: Shipping Query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "03322188-9509-446a-a4a8-ce3bb83ec87c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "❓ How long does shipping take?\n",
      "💡 Acme ships globally in 3-5 business days. This means that shipping typically takes between 3 to 5 working days from the date of dispatch or order fulfillment.\n"
     ]
    }
   ],
   "source": [
    "query = \"How long does shipping take?\"\n",
    "response = chain.invoke(query)\n",
    "print(\"❓\", query)\n",
    "print(\"💡\", response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7krhqj88ku",
   "metadata": {},
   "source": [
    "#### Example 2: Returns Policy Query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "61995550-bb0b-46a8-a5d0-023207475d60",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n",
      "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "❓ Can I return a product after 40 days?\n",
      "💡 Based on the provided context, you cannot return a product after 40 days. The return window is limited to 30 days from the date of purchase.\n"
     ]
    }
   ],
   "source": [
    "query = \"Can I return a product after 40 days?\"\n",
    "response = chain.invoke(query)\n",
    "print(\"❓\", query)\n",
    "print(\"💡\", response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "h4w24fadvjs",
   "metadata": {},
   "source": [
    "---\n",
    "We have successfully built a RAG system that combines:\n",
    "\n",
    "- **LlamaStack** for infrastructure (LLM serving + Vector Store)\n",
    "- **LangChain** for orchestration (prompts + chains)\n",
    "- **Fireworks** for high-quality language models\n",
    "\n",
    "### Key Benefits\n",
    "\n",
    "1. **Unified Infrastructure**: Single server for LLMs and Vector Store\n",
    "2. **OpenAI Compatibility**: Easy integration with existing LangChain code\n",
    "3. **Multi-Provider Support**: Switch between different LLM providers\n",
    "4. **Production Ready**: Built-in safety shields and monitoring\n",
    "\n",
    "### Next Steps\n",
    "\n",
    "- Add more sophisticated document processing\n",
    "- Implement conversation memory\n",
    "- Add safety filtering and monitoring\n",
    "- Scale to larger document collections\n",
    "- Integrate with web frameworks like FastAPI or Streamlit\n",
    "\n",
    "---\n",
    "\n",
    "##### 🔧 Cleanup\n",
    "\n",
    "Don't forget to stop the LlamaStack server when you're done:\n",
    "\n",
    "```python\n",
    "kill_llama_stack_server()\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "15647c46-22ce-4698-af3f-8161329d8e3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "kill_llama_stack_server()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
94  docs/source/apis/api_leveling.md (new file)
@@ -0,0 +1,94 @@
# Llama Stack API Stability Leveling

In order to provide a stable experience in Llama Stack, the various APIs need different stability levels indicating the level of support, backwards compatibility, and overall production readiness.

## Different Levels

### v1alpha

- Little to no expectation of support between versions
- Breaking changes are permitted
- Datatypes and parameters can break
- Routes can be added and removed

#### Graduation Criteria

- An API can graduate from `v1alpha` to `v1beta` if the team has identified the extent of the non-optional routes and the shape of their parameters/return types for the API, e.g. `/v1/openai/chat/completions`. Optional types can change.
- CRUD must stay stable once in `v1beta`. This is a commitment to backward compatibility, guaranteeing that most code you write against the v1beta version will not break during future updates. We may make additive changes (like adding a new, optional field to a response), but we will not make breaking changes (like renaming an existing "modelName" field to "name", changing an ID's data type from an integer to a string, or altering an endpoint URL).
- For OpenAI APIs, a comparison to the OpenAI spec for the specific API can be done to ensure completeness.

### v1beta

- API routes remain consistent between versions
- Parameters and return types are not guaranteed to stay stable between versions
- The API, besides minor fixes and adjustments, should be _almost_ v1. Changes should not be drastic.

#### Graduation Criteria

- An API can graduate from `v1beta` to `v1` if the API surface and datatypes are complete as identified by the team. The parameters and return types that are mandatory for each route are stable. All aspects of graduating from `v1alpha` to `v1beta` apply as well.
- Optional parameters, routes, or parts of the return type can be added after graduating to `v1`

### v1 (stable)

- Considered stable
- Backwards compatible between Z-streams
- Y-stream breaking changes must go through the proper approval and announcement process.
- Datatypes for a route and its return types cannot change between Z-streams
- Y-stream datatype changes should be sparing, unless the changes are additional net-new parameters
- Must have proper conformance testing as outlined in https://github.com/llamastack/llama-stack/issues/3237

### v2+ (Major Versions)

Introducing a new major version like `/v2` is a significant and disruptive event that should be treated as a last resort. It is reserved for essential changes to a stable `/v1` API that are fundamentally backward-incompatible and cannot be implemented through additive, non-breaking changes or breaking changes across X/Y-stream releases (x.y.z).

If a `/v2` version is deemed absolutely necessary, it must adhere to the following protocol to ensure a sane and predictable transition for users:

#### Lifecycle Progression

A new major version must follow the same stability lifecycle as `/v1`. It will be introduced as `/v2alpha`, mature to `/v2beta`, and finally become stable as `/v2`.

#### Coexistence

The new `/v2` API must be introduced alongside the existing `/v1` API and run in parallel. It must not replace the `/v1` API immediately.

#### Deprecation Policy

When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.

### API Stability vs. Provider Stability

The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.

Providers can iterate as much as they want on functionality as long as they work within the bounds of an API. If they need to change the API, then the API should not be `/v1`, or those breaking changes can only happen on a Y-stream release basis.

### Approval and Announcement Process for Breaking Changes

- **PR Labeling**: Any pull request that introduces a breaking API change must be clearly labeled with `breaking-change`.
- **PR Title/Commit**: Any pull request that introduces a breaking API change must contain `BREAKING CHANGE` in the title and commit footer. Alternatively, the commit can include `!`, e.g. `feat(api)!: title goes here`. This is outlined in the [conventional commits documentation](https://www.conventionalcommits.org/en/v1.0.0/#specification); see the example after this list.
- **Maintainer Review**: At least one maintainer must explicitly acknowledge the breaking change during review by applying the `breaking-change` label. An approval must come with this label or the acknowledgement that this label has already been applied.
- **Announcement**: Breaking changes require inclusion in release notes and, if applicable, a separate communication (e.g., Discord, GitHub Issues, or GitHub Discussions) prior to release.
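
As a sketch of the commit hygiene above (the change described is hypothetical and only illustrates the format), a commit message satisfying both the title and footer requirements could look like:

```
feat(api)!: move post_training routes under /v1alpha

BREAKING CHANGE: /v1/post-training/* routes are now served under
/v1alpha/post-training/*; clients must update their base paths.
```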

If a PR has proper approvals, labels, and commit/title hygiene, the failing API conformance tests will be bypassed.

## Enforcement

### Migration of API routes under `/v1alpha`, `/v1beta`, and `/v1`

Instead of placing every API under `/v1`, any API that is not fully stable or complete should go under `/v1alpha` or `/v1beta`. For example, at the time of this writing, `post_training` belongs here, as well as any OpenAI-compatible API whose surface does not exactly match the upstream OpenAI API it mimics.

This migration is crucial as we get Llama Stack in the hands of users who intend to productize various APIs. A clear view of what is stable and what is actively being developed will enable users to pick and choose various APIs to build their products on.

This migration will be a breaking change for any API moving out of `/v1`. Ideally, this should happen before 0.3.0 and especially 1.0.0.

### `x-stability` tags in the OpenAPI spec for oasdiff

`x-stability` tags allow tools like oasdiff to enforce different rules for different stability levels; these tags should match the routes: [oasdiff stability](https://github.com/oasdiff/oasdiff/blob/main/docs/STABILITY.md)
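
As a rough sketch of how a route could be tagged (the path is illustrative, and the exact extension key should be confirmed against the oasdiff stability docs linked above):

```yaml
paths:
  /v1alpha/post-training/jobs:
    get:
      # alpha-level routes get relaxed breaking-change rules from oasdiff,
      # while stable routes are checked against the full rule set
      x-stability-level: alpha
      responses:
        '200':
          description: OK
```

oasdiff can then compare two spec revisions (for example, `oasdiff breaking old-spec.yaml new-spec.yaml`) and apply the appropriate rules per stability level.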

### Testing

The testing of each stable API is already outlined in [issue #3237](https://github.com/llamastack/llama-stack/issues/3237) and is being worked on. These sorts of conformance tests should apply primarily to `/v1` APIs only, with `/v1alpha` and `/v1beta` having any tests the maintainers see fit as well as basic testing to ensure the routing works properly.

### New APIs going forward

Any subsequently introduced API should be introduced as `/v1alpha`.
@@ -9,8 +9,8 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key, only needed of using the hosted service |
-| `project_id` | `str \| None` | No | | The Project ID key, only needed of using the hosted service |
+| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
+| `project_id` | `str \| None` | No | | The Project ID key |
 | `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |

 ## Sample Configuration