chore: unpublish /inference/chat-completion (#3609)

# What does this PR do?

BREAKING CHANGE: removes the /inference/chat-completion route and updates the relevant documentation
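
For callers hit by this change, a minimal before/after sketch (parameter names taken from the doc diffs in this commit; note the rename `model_id` -> `model` and the response shape change `completion_message` -> `choices[0].message`):

```python
# Before (removed in this PR): the /inference/chat-completion route.
# response = client.inference.chat_completion(
#     model_id="meta-llama/Llama-3.1-8B-Instruct",
#     messages=[{"role": "user", "content": "Hello"}],
# )

# After: the OpenAI-compatible chat completions surface.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)
```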

## Test Plan

🤷
Author: Matthew Farrellee, 2025-09-30 14:00:42 -04:00 (committed by GitHub)
Parent: 62e302613f
Commit: cb33f45c11
23 changed files with 1448 additions and 2137 deletions


@@ -1030,7 +1030,6 @@ class InferenceProvider(Protocol):
         """
         ...

-    @webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
     async def chat_completion(
         self,
         model_id: str,
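
The removed `@webmethod` decorator is what published `chat_completion` at `/inference/chat-completion`; with it gone, the protocol method remains but is no longer routed. A simplified sketch of the pattern (illustrative only, not llama-stack's actual implementation; assume the server's router scans methods for this metadata):

```python
# Hypothetical, simplified version of a webmethod-style decorator: it only
# attaches route metadata to the function for a router to discover later.
def webmethod(route: str, method: str = "POST", level: str | None = None):
    def wrap(fn):
        fn.__webmethod__ = {"route": route, "method": method, "level": level}
        return fn

    return wrap
```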


@@ -44,8 +44,8 @@ client.initialize()
 The following example shows how to create a chat completion for an NVIDIA NIM.

 ```python
-response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "system",
@@ -57,11 +57,9 @@ response = client.inference.chat_completion(
         },
     ],
     stream=False,
-    sampling_params={
-        "max_tokens": 50,
-    },
+    max_tokens=50,
 )
-print(f"Response: {response.completion_message.content}")
+print(f"Response: {response.choices[0].message.content}")
 ```
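
The updated example pins `stream=False`. For streaming, the OpenAI-compatible surface conventionally returns an iterator of chunks; a sketch assuming the client mirrors the OpenAI SDK's chunk shape (the `choices[0]` access in the docs suggests it does):

```python
# Streaming variant of the example above (sketch; assumes OpenAI-style
# chunks where incremental text arrives on choices[0].delta.content).
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Write a haiku about language models."}],
    stream=True,
    max_tokens=50,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:  # skip deltas without text (e.g. role headers)
        print(delta.content, end="", flush=True)
print()
```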
### Tool Calling Example ###
@@ -89,15 +87,15 @@ tool_definition = ToolDefinition(
     },
 )

-tool_response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+tool_response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )

-print(f"Tool Response: {tool_response.completion_message.content}")
-if tool_response.completion_message.tool_calls:
-    for tool_call in tool_response.completion_message.tool_calls:
+print(f"Tool Response: {tool_response.choices[0].message.content}")
+if tool_response.choices[0].message.tool_calls:
+    for tool_call in tool_response.choices[0].message.tool_calls:
         print(f"Tool Called: {tool_call.tool_name}")
         print(f"Arguments: {tool_call.arguments}")
 ```
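
The docs stop after printing the tool call; a common next step is executing the tool and returning its result for a final answer. A sketch under OpenAI-style message conventions (`weather_lookup` is an illustrative stand-in, and since the hunk keeps `tool_call.tool_name`/`tool_call.arguments`, exact field names such as `id` vs. `call_id` depend on the client's response type):

```python
import json


def weather_lookup(city: str) -> str:
    # Illustrative stand-in for a real weather API; not part of the docs.
    return json.dumps({"city": city, "temp_f": 61, "conditions": "fog"})


message = tool_response.choices[0].message
for tool_call in message.tool_calls or []:
    args = tool_call.arguments
    if isinstance(args, str):
        # Some client types return arguments as a JSON-encoded string.
        args = json.loads(args)
    result = weather_lookup(**args)
    # OpenAI-style convention: echo the assistant turn, then add a "tool" turn.
    followup = client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=[
            {"role": "user", "content": "What's the weather like in San Francisco?"},
            message,
            {"role": "tool", "tool_call_id": tool_call.id, "content": result},
        ],
    )
    print(followup.choices[0].message.content)
```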
@@ -123,8 +121,8 @@ response_format = JsonSchemaResponseFormat(
     type=ResponseFormatType.json_schema, json_schema=person_schema
 )

-structured_response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+structured_response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "user",
@@ -134,7 +132,7 @@ structured_response = client.inference.chat_completion(
     response_format=response_format,
 )

-print(f"Structured Response: {structured_response.completion_message.content}")
+print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```
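
Since the response is constrained to `person_schema`, the content should be machine-parseable; a minimal sketch (assumes the model returns a single JSON object as a string, and that the schema defines a "name" field):

```python
import json

# Parse the schema-constrained output into a dict.
person = json.loads(structured_response.choices[0].message.content)
print(person["name"])  # "name" is assumed to be a field in person_schema
```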
### Create Embeddings
@@ -167,8 +165,8 @@ def load_image_as_base64(image_path):
 image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)

-vlm_response = client.inference.chat_completion(
-    model_id="nvidia/vila",
+vlm_response = client.chat.completions.create(
+    model="nvidia/vila",
     messages=[
         {
             "role": "user",
@@ -188,5 +186,5 @@ vlm_response = client.inference.chat_completion(
     ],
 )

-print(f"VLM Response: {vlm_response.completion_message.content}")
+print(f"VLM Response: {vlm_response.choices[0].message.content}")
 ```
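
The hunk elides the actual message body between `"role": "user"` and the closing bracket; for reference, a typical OpenAI-style data-URL image payload looks like this (illustrative only; the exact structure used in these docs is not visible in the hunk):

```python
# Hypothetical multimodal message layout; field names follow the common
# OpenAI-compatible convention, not necessarily these docs verbatim.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{demo_image_b64}"},
            },
        ],
    }
]
```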