chore: unpublish /inference/chat-completion (#3609)

# What does this PR do?

BREAKING CHANGE: removes the /inference/chat-completion route and updates the relevant documentation
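
For callers hit by this change, a minimal before/after sketch (parameter names taken from the doc diffs in this commit; note the rename `model_id` -> `model` and the response shape change `completion_message` -> `choices[0].message`):

```python
# Before (removed in this PR): the /inference/chat-completion route.
# response = client.inference.chat_completion(
#     model_id="meta-llama/Llama-3.1-8B-Instruct",
#     messages=[{"role": "user", "content": "Hello"}],
# )

# After: the OpenAI-compatible chat completions surface.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)
```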

## Test Plan

🤷
Author: Matthew Farrellee, 2025-09-30 14:00:42 -04:00 (committed by GitHub)
Parent: 62e302613f
Commit: cb33f45c11
23 changed files with 1448 additions and 2137 deletions


@@ -1030,7 +1030,6 @@ class InferenceProvider(Protocol):
         """
         ...

-    @webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
     async def chat_completion(
         self,
         model_id: str,
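
The removed `@webmethod` decorator is what published `chat_completion` at `/inference/chat-completion`; with it gone, the protocol method remains but is no longer routed. A simplified sketch of the pattern (illustrative only, not llama-stack's actual implementation; assume the server's router scans methods for this metadata):

```python
# Hypothetical, simplified version of a webmethod-style decorator: it only
# attaches route metadata to the function for a router to discover later.
def webmethod(route: str, method: str = "POST", level: str | None = None):
    def wrap(fn):
        fn.__webmethod__ = {"route": route, "method": method, "level": level}
        return fn

    return wrap
```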


@@ -44,8 +44,8 @@ client.initialize()
 The following example shows how to create a chat completion for an NVIDIA NIM.

 ```python
-response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "system",
@@ -57,11 +57,9 @@ response = client.inference.chat_completion(
         },
     ],
     stream=False,
-    sampling_params={
-        "max_tokens": 50,
-    },
+    max_tokens=50,
 )
-print(f"Response: {response.completion_message.content}")
+print(f"Response: {response.choices[0].message.content}")
 ```
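
The updated example pins `stream=False`. For streaming, the OpenAI-compatible surface conventionally returns an iterator of chunks; a sketch assuming the client mirrors the OpenAI SDK's chunk shape (the `choices[0]` access in the docs suggests it does):

```python
# Streaming variant of the example above (sketch; assumes OpenAI-style
# chunks where incremental text arrives on choices[0].delta.content).
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Write a haiku about language models."}],
    stream=True,
    max_tokens=50,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:  # skip deltas without text (e.g. role headers)
        print(delta.content, end="", flush=True)
print()
```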
### Tool Calling Example ###
@@ -89,15 +87,15 @@ tool_definition = ToolDefinition(
     },
 )

-tool_response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+tool_response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )

-print(f"Tool Response: {tool_response.completion_message.content}")
-if tool_response.completion_message.tool_calls:
-    for tool_call in tool_response.completion_message.tool_calls:
+print(f"Tool Response: {tool_response.choices[0].message.content}")
+if tool_response.choices[0].message.tool_calls:
+    for tool_call in tool_response.choices[0].message.tool_calls:
         print(f"Tool Called: {tool_call.tool_name}")
         print(f"Arguments: {tool_call.arguments}")
 ```
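
The docs stop after printing the tool call; a common next step is executing the tool and returning its result for a final answer. A sketch under OpenAI-style message conventions (`weather_lookup` is an illustrative stand-in, and since the hunk keeps `tool_call.tool_name`/`tool_call.arguments`, exact field names such as `id` vs. `call_id` depend on the client's response type):

```python
import json


def weather_lookup(city: str) -> str:
    # Illustrative stand-in for a real weather API; not part of the docs.
    return json.dumps({"city": city, "temp_f": 61, "conditions": "fog"})


message = tool_response.choices[0].message
for tool_call in message.tool_calls or []:
    args = tool_call.arguments
    if isinstance(args, str):
        # Some client types return arguments as a JSON-encoded string.
        args = json.loads(args)
    result = weather_lookup(**args)
    # OpenAI-style convention: echo the assistant turn, then add a "tool" turn.
    followup = client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=[
            {"role": "user", "content": "What's the weather like in San Francisco?"},
            message,
            {"role": "tool", "tool_call_id": tool_call.id, "content": result},
        ],
    )
    print(followup.choices[0].message.content)
```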
@@ -123,8 +121,8 @@ response_format = JsonSchemaResponseFormat(
     type=ResponseFormatType.json_schema, json_schema=person_schema
 )

-structured_response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+structured_response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "user",
@@ -134,7 +132,7 @@ structured_response = client.inference.chat_completion(
     response_format=response_format,
 )

-print(f"Structured Response: {structured_response.completion_message.content}")
+print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```
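
Since the response is constrained to `person_schema`, the content should be machine-parseable; a minimal sketch (assumes the model returns a single JSON object as a string, and that the schema defines a "name" field):

```python
import json

# Parse the schema-constrained output into a dict.
person = json.loads(structured_response.choices[0].message.content)
print(person["name"])  # "name" is assumed to be a field in person_schema
```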
### Create Embeddings
@@ -167,8 +165,8 @@ def load_image_as_base64(image_path):
 image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)

-vlm_response = client.inference.chat_completion(
-    model_id="nvidia/vila",
+vlm_response = client.chat.completions.create(
+    model="nvidia/vila",
     messages=[
         {
             "role": "user",
@@ -188,5 +186,5 @@ vlm_response = client.inference.chat_completion(
     ],
 )

-print(f"VLM Response: {vlm_response.completion_message.content}")
+print(f"VLM Response: {vlm_response.choices[0].message.content}")
 ```
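
The hunk elides the actual message body between `"role": "user"` and the closing bracket; for reference, a typical OpenAI-style data-URL image payload looks like this (illustrative only; the exact structure used in these docs is not visible in the hunk):

```python
# Hypothetical multimodal message layout; field names follow the common
# OpenAI-compatible convention, not necessarily these docs verbatim.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{demo_image_b64}"},
            },
        ],
    }
]
```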