chore: unpublish /inference/chat-completion (#3609)

# What does this PR do?

BREAKING CHANGE: removes /inference/chat-completion route and updates
relevant documentation
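
For context, a minimal before/after sketch of the client-side change, distilled from the notebook and doc edits below. It is a hedged illustration only: the import path, `base_url`, and `MODEL_NAME` value are placeholders taken from the updated docs, not something this commit adds.

```python
# Migration sketch (assumptions: llama_stack_client is installed and a
# Llama Stack server is running on localhost:8321 with the model registered).
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"  # placeholder model id

# Removed surface (unpublished by this PR):
# response = client.inference.chat_completion(
#     messages=[{"role": "user", "content": "Write a two-sentence poem about llama."}],
#     model_id=MODEL_NAME,
# )
# print(response.completion_message.content)

# OpenAI-compatible replacement used throughout the updated docs:
response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Write a two-sentence poem about llama."}],
    model=MODEL_NAME,                           # was model_id=
)
print(response.choices[0].message.content)      # was response.completion_message.content
```

Streaming calls change the same way: pass `stream=True` to `client.chat.completions.create` and iterate the returned chunks, as the updated vision notebook does.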

## Test Plan

🤷
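
The test plan is left open; as a rough, non-authoritative sketch, a locally running distribution could be spot-checked along these lines. The port, model name, and the exact path of the removed route are assumptions based on the docs touched below, not verified here.

```python
# Hedged manual spot-check: the OpenAI-compatible route should answer,
# while the unpublished inference route should no longer be served.
import requests

BASE = "http://localhost:8321"  # assumed local server port
payload = {
    "model": "meta-llama/Llama-3.2-3B-Instruct",  # placeholder registered model
    "messages": [{"role": "user", "content": "Write me a 2-sentence poem about the moon"}],
}

new_route = requests.post(f"{BASE}/v1/chat/completions", json=payload)
print(new_route.status_code)  # expected: 200
print(new_route.json()["choices"][0]["message"]["content"])

# Old path is an assumption based on the route named in the PR title.
old_route = requests.post(f"{BASE}/v1/inference/chat-completion", json={
    "model_id": payload["model"],
    "messages": payload["messages"],
})
print(old_route.status_code)  # expected: 404 once the route is unpublished
```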
Matthew Farrellee 2025-09-30 14:00:42 -04:00 committed by GitHub
parent 62e302613f
commit cb33f45c11
23 changed files with 1448 additions and 2137 deletions


@@ -102,15 +102,15 @@
}
],
"source": [
"response = client.inference.chat_completion(\n",
"response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
" ],\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
")\n",
"\n",
"print(response.completion_message.content)"
"print(response.choices[0].message.content)"
]
},
{
@@ -141,14 +141,14 @@
}
],
"source": [
"response = client.inference.chat_completion(\n",
"response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are shakespeare.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
" ],\n",
" model_id=MODEL_NAME, # Changed from model to model_id\n",
" model=MODEL_NAME,\n",
")\n",
"print(response.completion_message.content)"
"print(response.choices[0].message.content)"
]
},
{
@@ -218,11 +218,11 @@
" break\n",
"\n",
" message = {\"role\": \"user\", \"content\": user_input}\n",
" response = client.inference.chat_completion(\n",
" response = client.chat.completions.create(\n",
" messages=[message],\n",
" model_id=MODEL_NAME\n",
" model=MODEL_NAME\n",
" )\n",
" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
"\n",
"# Run the chat loop in a Jupyter Notebook cell using await\n",
"await chat_loop()\n",
@@ -288,16 +288,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
" response = client.inference.chat_completion(\n",
" response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
" )\n",
" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
"\n",
" # Append the assistant message with all required fields\n",
" assistant_message = {\n",
" \"role\": \"user\",\n",
" \"content\": response.completion_message.content,\n",
" \"content\": response.choices[0].message.content,\n",
" # Add any additional required fields here if necessary\n",
" }\n",
" conversation_history.append(assistant_message)\n",
@@ -349,14 +349,14 @@
" }\n",
" cprint(f'User> {message[\"content\"]}', 'green')\n",
"\n",
" response = client.inference.chat_completion(\n",
" response = client.chat.completions.create(\n",
" messages=[message],\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
" stream=stream,\n",
" )\n",
"\n",
" if not stream:\n",
" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
" else:\n",
" for log in EventLogger().log(response):\n",
" log.print()\n",


@@ -134,15 +134,15 @@
" }\n",
" cprint(f'User> {message[\"content\"]}', 'green')\n",
"\n",
" response = await client.inference.chat_completion(\n",
" response = await client.chat.completions.create(\n",
" messages=[message],\n",
" model_id='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
" model='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
" stream=stream,\n",
" )\n",
"\n",
" cprint(f'Assistant> ', color='cyan', end='')\n",
" if not stream:\n",
" cprint(response.completion_message.content, color='yellow')\n",
" cprint(response.choices[0].message.content, color='yellow')\n",
" else:\n",
" async for chunk in response:\n",
" cprint(chunk.event.delta.text, color='yellow', end='')\n",


@@ -152,8 +152,8 @@
"metadata": {},
"outputs": [],
"source": [
"response = client.inference.chat_completion(\n",
" messages=few_shot_examples, model_id=MODEL_NAME\n",
"response = client.chat.completions.create(\n",
" messages=few_shot_examples, model=MODEL_NAME\n",
")"
]
},
@@ -164,7 +164,7 @@
"source": [
"#### 4. Display the Models Response\n",
"\n",
"The `completion_message` contains the assistants generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
"The `choices[0].message.content` contains the assistants generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
]
},
{
@@ -184,7 +184,7 @@
"source": [
"from termcolor import cprint\n",
"\n",
"cprint(f'> Response: {response.completion_message.content}', 'cyan')"
"cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
]
},
{
@@ -219,7 +219,7 @@
"\n",
"client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')\n",
"\n",
"response = client.inference.chat_completion(\n",
"response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": 'Have shorter, spear-shaped ears.'},\n",
" {\n",
@@ -253,10 +253,10 @@
" \"content\": 'Generally taller and more robust, commonly seen as guard animals.'\n",
" }\n",
"],\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
")\n",
"\n",
"cprint(f'> Response: {response.completion_message.content}', 'cyan')"
"cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
]
},
{


@@ -102,15 +102,15 @@
" }\n",
"\n",
" cprint(\"User> Sending image for analysis...\", \"green\")\n",
" response = client.inference.chat_completion(\n",
" response = client.chat.completions.create(\n",
" messages=[message],\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
" stream=stream,\n",
" )\n",
"\n",
" cprint(f'Assistant> ', color='cyan', end='')\n",
" if not stream:\n",
" cprint(response.completion_message.content, color='yellow')\n",
" cprint(response.choices[0].message.content, color='yellow')\n",
" else:\n",
" for chunk in response:\n",
" cprint(chunk.event.delta.text, color='yellow', end='')\n",


@@ -131,14 +131,37 @@ After setting up the server, open a new terminal window and configure the llama-
```
**Expected Output:**
```bash
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content='Here is a 2-sentence poem about the moon:\n\nSilver crescent shining bright in the night,\nA beacon of wonder, full of gentle light.',
-        role='assistant',
-        stop_reason='end_of_turn',
-        tool_calls=[]
-    ),
-    logprobs=None
+OpenAIChatCompletion(
+    id='chatcmpl-950',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content='...The moon casts silver threads through the velvet night, a silent bard of shadows, ancient and bright.',
+                name=None,
+                tool_calls=None,
+                refusal=None,
+                annotations=None,
+                audio=None,
+                function_call=None
+            ),
+            logprobs=None
+        )
+    ],
+    created=1759240813,
+    model='meta-llama/Llama-3.2-3B-Instruct',
+    object='chat.completion',
+    service_tier=None,
+    system_fingerprint='fp_ollama',
+    usage={
+        'completion_tokens': 479,
+        'prompt_tokens': 19,
+        'total_tokens': 498,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    },
)
```
@@ -147,21 +170,16 @@ After setting up the server, open a new terminal window and configure the llama-
After setting up the server, open a new terminal window and verify it's working by sending a `POST` request using `curl`:
```bash
-curl http://localhost:$LLAMA_STACK_PORT/alpha/inference/chat-completion
+curl http://localhost:$LLAMA_STACK_PORT/v1/chat/completions
-H "Content-Type: application/json"
-d @- <<EOF
{
"model_id": "$INFERENCE_MODEL",
"model": "$INFERENCE_MODEL",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write me a 2-sentence poem about the moon"}
],
"sampling_params": {
"strategy": {
"type": "top_p",
"temperatrue": 0.7,
"top_p": 0.95,
},
"temperature": 0.7,
"seed": 42,
"max_tokens": 512
}
@@ -174,13 +192,9 @@ You can check the available models with the command `uv run --with llama-stack-c
**Expected Output:**
```json
{
"completion_message": {
"role": "assistant",
"content": "The moon glows softly in the midnight sky,\nA beacon of wonder, as it catches the eye.",
"stop_reason": "out_of_tokens",
"tool_calls": []
},
"logprobs": null
...
"content": "... The moon glows softly in the midnight sky,\nA beacon of wonder, as it catches the eye.",
...
}
```
@@ -213,17 +227,17 @@ if INFERENCE_MODEL is None:
# Initialize the clien
client = LlamaStackClient(base_url="http://localhost:8321")
-# Create a chat completion reques
-response = client.inference.chat_completion(
+# Create a chat completion request
+response = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a friendly assistant."},
{"role": "user", "content": "Write a two-sentence poem about llama."},
],
-    model_id=INFERENCE_MODEL,
+    model=INFERENCE_MODEL,
)
# Print the response
-print(response.completion_message.content)
+print(response.choices[0].message.content)
```
### 3. Run the Python Script