chore: unpublish /inference/chat-completion (#3609)

# What does this PR do?

BREAKING CHANGE: removes /inference/chat-completion route and updates
relevant documentation
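
For context, a minimal before/after sketch of the client-side change, distilled from the notebook and doc edits below. It is a hedged illustration only: the import path, `base_url`, and `MODEL_NAME` value are placeholders taken from the updated docs, not something this commit adds.

```python
# Migration sketch (assumptions: llama_stack_client is installed and a
# Llama Stack server is running on localhost:8321 with the model registered).
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"  # placeholder model id

# Removed surface (unpublished by this PR):
# response = client.inference.chat_completion(
#     messages=[{"role": "user", "content": "Write a two-sentence poem about llama."}],
#     model_id=MODEL_NAME,
# )
# print(response.completion_message.content)

# OpenAI-compatible replacement used throughout the updated docs:
response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Write a two-sentence poem about llama."}],
    model=MODEL_NAME,                           # was model_id=
)
print(response.choices[0].message.content)      # was response.completion_message.content
```

Streaming calls change the same way: pass `stream=True` to `client.chat.completions.create` and iterate the returned chunks, as the updated vision notebook does.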

## Test Plan

🤷
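
The test plan is left open; as a rough, non-authoritative sketch, a locally running distribution could be spot-checked along these lines. The port, model name, and the exact path of the removed route are assumptions based on the docs touched below, not verified here.

```python
# Hedged manual spot-check: the OpenAI-compatible route should answer,
# while the unpublished inference route should no longer be served.
import requests

BASE = "http://localhost:8321"  # assumed local server port
payload = {
    "model": "meta-llama/Llama-3.2-3B-Instruct",  # placeholder registered model
    "messages": [{"role": "user", "content": "Write me a 2-sentence poem about the moon"}],
}

new_route = requests.post(f"{BASE}/v1/chat/completions", json=payload)
print(new_route.status_code)  # expected: 200
print(new_route.json()["choices"][0]["message"]["content"])

# Old path is an assumption based on the route named in the PR title.
old_route = requests.post(f"{BASE}/v1/inference/chat-completion", json={
    "model_id": payload["model"],
    "messages": payload["messages"],
})
print(old_route.status_code)  # expected: 404 once the route is unpublished
```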
Matthew Farrellee 2025-09-30 14:00:42 -04:00 committed by GitHub
parent 62e302613f
commit cb33f45c11
23 changed files with 1448 additions and 2137 deletions


@@ -102,15 +102,15 @@
}
],
"source": [
"response = client.inference.chat_completion(\n",
"response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
" ],\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
")\n",
"\n",
"print(response.completion_message.content)"
"print(response.choices[0].message.content)"
]
},
{
@@ -141,14 +141,14 @@
}
],
"source": [
"response = client.inference.chat_completion(\n",
"response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are shakespeare.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
" ],\n",
" model_id=MODEL_NAME, # Changed from model to model_id\n",
" model=MODEL_NAME,\n",
")\n",
"print(response.completion_message.content)"
"print(response.choices[0].message.content)"
]
},
{
@@ -218,11 +218,11 @@
" break\n",
"\n",
" message = {\"role\": \"user\", \"content\": user_input}\n",
" response = client.inference.chat_completion(\n",
" response = client.chat.completions.create(\n",
" messages=[message],\n",
" model_id=MODEL_NAME\n",
" model=MODEL_NAME\n",
" )\n",
" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
"\n",
"# Run the chat loop in a Jupyter Notebook cell using await\n",
"await chat_loop()\n",
@@ -288,16 +288,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
" response = client.inference.chat_completion(\n",
" response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
" )\n",
" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
"\n",
" # Append the assistant message with all required fields\n",
" assistant_message = {\n",
" \"role\": \"user\",\n",
" \"content\": response.completion_message.content,\n",
" \"content\": response.choices[0].message.content,\n",
" # Add any additional required fields here if necessary\n",
" }\n",
" conversation_history.append(assistant_message)\n",
@@ -349,14 +349,14 @@
" }\n",
" cprint(f'User> {message[\"content\"]}', 'green')\n",
"\n",
" response = client.inference.chat_completion(\n",
" response = client.chat.completions.create(\n",
" messages=[message],\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
" stream=stream,\n",
" )\n",
"\n",
" if not stream:\n",
" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
" else:\n",
" for log in EventLogger().log(response):\n",
" log.print()\n",


@@ -134,15 +134,15 @@
" }\n",
" cprint(f'User> {message[\"content\"]}', 'green')\n",
"\n",
" response = await client.inference.chat_completion(\n",
" response = await client.chat.completions.create(\n",
" messages=[message],\n",
" model_id='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
" model='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
" stream=stream,\n",
" )\n",
"\n",
" cprint(f'Assistant> ', color='cyan', end='')\n",
" if not stream:\n",
" cprint(response.completion_message.content, color='yellow')\n",
" cprint(response.choices[0].message.content, color='yellow')\n",
" else:\n",
" async for chunk in response:\n",
" cprint(chunk.event.delta.text, color='yellow', end='')\n",


@@ -152,8 +152,8 @@
"metadata": {},
"outputs": [],
"source": [
"response = client.inference.chat_completion(\n",
" messages=few_shot_examples, model_id=MODEL_NAME\n",
"response = client.chat.completions.create(\n",
" messages=few_shot_examples, model=MODEL_NAME\n",
")"
]
},
@@ -164,7 +164,7 @@
"source": [
"#### 4. Display the Models Response\n",
"\n",
"The `completion_message` contains the assistants generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
"The `choices[0].message.content` contains the assistants generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
]
},
{
@@ -184,7 +184,7 @@
"source": [
"from termcolor import cprint\n",
"\n",
"cprint(f'> Response: {response.completion_message.content}', 'cyan')"
"cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
]
},
{
@@ -219,7 +219,7 @@
"\n",
"client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')\n",
"\n",
"response = client.inference.chat_completion(\n",
"response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": 'Have shorter, spear-shaped ears.'},\n",
" {\n",
@@ -253,10 +253,10 @@
" \"content\": 'Generally taller and more robust, commonly seen as guard animals.'\n",
" }\n",
"],\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
")\n",
"\n",
"cprint(f'> Response: {response.completion_message.content}', 'cyan')"
"cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
]
},
{


@@ -102,15 +102,15 @@
" }\n",
"\n",
" cprint(\"User> Sending image for analysis...\", \"green\")\n",
" response = client.inference.chat_completion(\n",
" response = client.chat.completions.create(\n",
" messages=[message],\n",
" model_id=MODEL_NAME,\n",
" model=MODEL_NAME,\n",
" stream=stream,\n",
" )\n",
"\n",
" cprint(f'Assistant> ', color='cyan', end='')\n",
" if not stream:\n",
" cprint(response.completion_message.content, color='yellow')\n",
" cprint(response.choices[0].message.content, color='yellow')\n",
" else:\n",
" for chunk in response:\n",
" cprint(chunk.event.delta.text, color='yellow', end='')\n",


@@ -131,14 +131,37 @@ After setting up the server, open a new terminal window and configure the llama-
```
**Expected Output:**
```bash
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content='Here is a 2-sentence poem about the moon:\n\nSilver crescent shining bright in the night,\nA beacon of wonder, full of gentle light.',
-        role='assistant',
-        stop_reason='end_of_turn',
-        tool_calls=[]
-    ),
-    logprobs=None
+OpenAIChatCompletion(
+    id='chatcmpl-950',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content='...The moon casts silver threads through the velvet night, a silent bard of shadows, ancient and bright.',
+                name=None,
+                tool_calls=None,
+                refusal=None,
+                annotations=None,
+                audio=None,
+                function_call=None
+            ),
+            logprobs=None
+        )
+    ],
+    created=1759240813,
+    model='meta-llama/Llama-3.2-3B-Instruct',
+    object='chat.completion',
+    service_tier=None,
+    system_fingerprint='fp_ollama',
+    usage={
+        'completion_tokens': 479,
+        'prompt_tokens': 19,
+        'total_tokens': 498,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    },
)
```
@@ -147,21 +170,16 @@ After setting up the server, open a new terminal window and configure the llama-
After setting up the server, open a new terminal window and verify it's working by sending a `POST` request using `curl`:
```bash
-curl http://localhost:$LLAMA_STACK_PORT/alpha/inference/chat-completion
+curl http://localhost:$LLAMA_STACK_PORT/v1/chat/completions
-H "Content-Type: application/json"
-d @- <<EOF
{
"model_id": "$INFERENCE_MODEL",
"model": "$INFERENCE_MODEL",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write me a 2-sentence poem about the moon"}
],
"sampling_params": {
"strategy": {
"type": "top_p",
"temperatrue": 0.7,
"top_p": 0.95,
},
"temperature": 0.7,
"seed": 42,
"max_tokens": 512
}
@@ -174,13 +192,9 @@ You can check the available models with the command `uv run --with llama-stack-c
**Expected Output:**
```json
{
"completion_message": {
"role": "assistant",
"content": "The moon glows softly in the midnight sky,\nA beacon of wonder, as it catches the eye.",
"stop_reason": "out_of_tokens",
"tool_calls": []
},
"logprobs": null
...
"content": "... The moon glows softly in the midnight sky,\nA beacon of wonder, as it catches the eye.",
...
}
```
@@ -213,17 +227,17 @@ if INFERENCE_MODEL is None:
# Initialize the clien
client = LlamaStackClient(base_url="http://localhost:8321")
-# Create a chat completion reques
-response = client.inference.chat_completion(
+# Create a chat completion request
+response = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a friendly assistant."},
{"role": "user", "content": "Write a two-sentence poem about llama."},
],
-    model_id=INFERENCE_MODEL,
+    model=INFERENCE_MODEL,
)
# Print the response
-print(response.completion_message.content)
+print(response.choices[0].message.content)
```
### 3. Run the Python Script