chore: unpublish /inference/chat-completion

Matthew Farrellee 2025-09-30 09:27:23 -04:00
parent 6cce553c93
commit b0e161d3db
23 changed files with 1448 additions and 2137 deletions
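
The change applied across the files below is mechanical: calls to the deprecated `client.inference.chat_completion` API (and its `completion_message` response shape) are rewritten against the OpenAI-compatible `client.chat.completions.create` API. A minimal before/after sketch, assuming an already initialized `LlamaStackClient` named `client` and an illustrative model id:

```python
# Before: deprecated inference API
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.completion_message.content)

# After: OpenAI-compatible API
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```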

View file

@@ -43,10 +43,21 @@ inference chat-completion \
     --model-id meta-llama/$MODEL \
     --message "write a haiku for meta's llama 4 models"
-ChatCompletionResponse(
-    completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
-    logprobs=None,
-    metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
+OpenAIChatCompletion(
+    ...
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content='...**Silent minds awaken,** \n**Whispers of billions of words,** \n**Reasoning breaks the night.** \n\n— \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
+                ...
+            ),
+            ...
+        )
+    ],
+    ...
 )
 ```
 ### Python SDK
@@ -59,14 +70,14 @@ model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 prompt = "Write a haiku about coding"
 print(f"User> {prompt}")
-response = client.inference.chat_completion(
-    model_id=model_id,
+response = client.chat.completions.create(
+    model=model_id,
     messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": prompt},
     ],
 )
-print(f"Assistant> {response.completion_message.content}")
+print(f"Assistant> {response.choices[0].message.content}")
 ```
 As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
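
For readers following the migration, a streaming variant of the call above is sketched below, reusing `client`, `model_id`, and `prompt` from the snippet; it assumes the server emits OpenAI-style chunks whose text arrives in `choices[0].delta.content`, which this diff does not show.

```python
# Streaming sketch against the new surface (chunk shape assumed to
# follow the OpenAI convention: choices[0].delta.content).
stream = client.chat.completions.create(
    model=model_id,
    messages=[{"role": "user", "content": prompt}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```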

View file

@@ -44,7 +44,7 @@ The playground provides interactive pages for users to explore Llama Stack API c
 **Simple Chat Interface**
 - Chat directly with Llama models through an intuitive interface
-- Uses the `/inference/chat-completion` streaming API under the hood
+- Uses the `/chat/completions` streaming API under the hood
 - Real-time message streaming for responsive interactions
 - Perfect for testing model capabilities and prompt engineering

View file

@@ -313,7 +313,7 @@ client = LlamaStackClient(
 )
 # All API calls will be automatically traced
-response = client.inference.chat_completion(
+response = client.chat.completions.create(
     model="meta-llama/Llama-3.2-3B-Instruct",
     messages=[{"role": "user", "content": "Hello!"}]
 )
@@ -327,7 +327,7 @@ with tracer.start_as_current_span("custom_operation") as span:
     span.set_attribute("user_id", "user123")
     span.set_attribute("operation_type", "chat_completion")
-    response = client.inference.chat_completion(
+    response = client.chat.completions.create(
         model="meta-llama/Llama-3.2-3B-Instruct",
         messages=[{"role": "user", "content": "Hello!"}]
     )

View file

@@ -216,7 +216,6 @@ from llama_stack_client.types import (
 Methods:
-- <code title="post /v1/inference/chat-completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
 - <code title="post /v1/inference/embeddings">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>
 ## VectorIo

View file

@@ -543,15 +543,15 @@
 "source": [
 "model_id = \"meta-llama/Llama-3.3-70B-Instruct\"\n",
 "\n",
-"response = client.inference.chat_completion(\n",
-"    model_id=model_id,\n",
+"response = client.chat.completions.create(\n",
+"    model=model_id,\n",
 "    messages=[\n",
 "        {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
 "        {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
 "    ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)\n"
+"print(response.choices[0].message.content)\n"
 ]
 },
 {
@@ -625,16 +625,16 @@
 "    user_message = {\"role\": \"user\", \"content\": user_input}\n",
 "    conversation_history.append(user_message)\n",
 "\n",
-"    response = client.inference.chat_completion(\n",
+"    response = client.chat.completions.create(\n",
 "        messages=conversation_history,\n",
-"        model_id=model_id,\n",
+"        model=model_id,\n",
 "    )\n",
-"    cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+"    cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 "    assistant_message = {\n",
 "        \"role\": \"assistant\", # was user\n",
-"        \"content\": response.completion_message.content,\n",
-"        \"stop_reason\": response.completion_message.stop_reason,\n",
+"        \"content\": response.choices[0].message.content,\n",
+"        \"stop_reason\": response.choices[0].finish_reason,\n",
 "    }\n",
 "    conversation_history.append(assistant_message)\n",
 "\n",
@@ -691,16 +691,16 @@
 "    user_message = {\"role\": \"user\", \"content\": user_input}\n",
 "    conversation_history.append(user_message)\n",
 "\n",
-"    response = client.inference.chat_completion(\n",
+"    response = client.chat.completions.create(\n",
 "        messages=conversation_history,\n",
-"        model_id=model_id,\n",
+"        model=model_id,\n",
 "    )\n",
-"    cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+"    cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 "    assistant_message = {\n",
 "        \"role\": \"assistant\", # was user\n",
-"        \"content\": response.completion_message.content,\n",
-"        \"stop_reason\": response.completion_message.stop_reason,\n",
+"        \"content\": response.choices[0].message.content,\n",
+"        \"stop_reason\": response.choices[0].finish_reason,\n",
 "    }\n",
 "    conversation_history.append(assistant_message)\n",
 "\n",
@@ -763,9 +763,9 @@
 "message = {\"role\": \"user\", \"content\": \"Write me a sonnet about llama\"}\n",
 "print(f'User> {message[\"content\"]}')\n",
 "\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[message],\n",
-"    model_id=model_id,\n",
+"    model=model_id,\n",
 "    stream=True, # <-----------\n",
 ")\n",
 "\n",
@@ -2917,7 +2917,7 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[\n",
 "        {\n",
 "            \"role\": \"user\",\n",
@@ -2937,11 +2937,11 @@
 "            ]\n",
 "        }\n",
 "    ],\n",
-"    model_id=vision_model_id,\n",
+"    model=vision_model_id,\n",
 "    stream=False,\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {

View file

@@ -577,15 +577,15 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
-"    model_id=model_id,\n",
+"response = client.chat.completions.create(\n",
+"    model=model_id,\n",
 "    messages=[\n",
 "        {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
 "        {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
 "    ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)\n"
+"print(response.choices[0].message.content)\n"
 ]
 },
 {
@@ -673,7 +673,7 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[\n",
 "        {\n",
 "            \"role\": \"user\",\n",
@@ -693,11 +693,11 @@
 "            ]\n",
 "        }\n",
 "    ],\n",
-"    model_id=model_id,\n",
+"    model=model_id,\n",
 "    stream=False,\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -767,16 +767,16 @@
 "    user_message = {\"role\": \"user\", \"content\": user_input}\n",
 "    conversation_history.append(user_message)\n",
 "\n",
-"    response = client.inference.chat_completion(\n",
+"    response = client.chat.completions.create(\n",
 "        messages=conversation_history,\n",
-"        model_id=model_id,\n",
+"        model=model_id,\n",
 "    )\n",
-"    cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+"    cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 "    assistant_message = {\n",
 "        \"role\": \"assistant\", # was user\n",
-"        \"content\": response.completion_message.content,\n",
-"        \"stop_reason\": response.completion_message.stop_reason,\n",
+"        \"content\": response.choices[0].message.content,\n",
+"        \"stop_reason\": response.choices[0].finish_reason,\n",
 "    }\n",
 "    conversation_history.append(assistant_message)\n",
 "\n",
@@ -831,16 +831,16 @@
 "    user_message = {\"role\": \"user\", \"content\": user_input}\n",
 "    conversation_history.append(user_message)\n",
 "\n",
-"    response = client.inference.chat_completion(\n",
+"    response = client.chat.completions.create(\n",
 "        messages=conversation_history,\n",
-"        model_id=model_id,\n",
+"        model=model_id,\n",
 "    )\n",
-"    cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+"    cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 "    assistant_message = {\n",
 "        \"role\": \"assistant\", # was user\n",
-"        \"content\": response.completion_message.content,\n",
-"        \"stop_reason\": response.completion_message.stop_reason,\n",
+"        \"content\": response.choices[0].message.content,\n",
+"        \"stop_reason\": response.choices[0].finish_reason,\n",
 "    }\n",
 "    conversation_history.append(assistant_message)\n",
 "\n",

View file

@@ -608,15 +608,15 @@
 "# TODO: update this with a vision model\n",
 "model_id = \"meta-llama/Llama-4-Maverick-17B-128E-Instruct\"\n",
 "\n",
-"response = client.inference.chat_completion(\n",
-"    model_id=model_id,\n",
+"response = client.chat.completions.create(\n",
+"    model=model_id,\n",
 "    messages=[\n",
 "        {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
 "        {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
 "    ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)\n"
+"print(response.choices[0].message.content)\n"
 ]
 },
 {
@@ -704,7 +704,7 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[\n",
 "        {\n",
 "            \"role\": \"user\",\n",
@@ -724,11 +724,11 @@
 "            ]\n",
 "        }\n",
 "    ],\n",
-"    model_id=model_id,\n",
+"    model=model_id,\n",
 "    stream=False,\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -798,16 +798,16 @@
 "    user_message = {\"role\": \"user\", \"content\": user_input}\n",
 "    conversation_history.append(user_message)\n",
 "\n",
-"    response = client.inference.chat_completion(\n",
+"    response = client.chat.completions.create(\n",
 "        messages=conversation_history,\n",
-"        model_id=model_id,\n",
+"        model=model_id,\n",
 "    )\n",
-"    cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+"    cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 "    assistant_message = {\n",
 "        \"role\": \"assistant\", # was user\n",
-"        \"content\": response.completion_message.content,\n",
-"        \"stop_reason\": response.completion_message.stop_reason,\n",
+"        \"content\": response.choices[0].message.content,\n",
+"        \"stop_reason\": response.choices[0].finish_reason,\n",
 "    }\n",
 "    conversation_history.append(assistant_message)\n",
 "\n",
@@ -862,16 +862,16 @@
 "    user_message = {\"role\": \"user\", \"content\": user_input}\n",
 "    conversation_history.append(user_message)\n",
 "\n",
-"    response = client.inference.chat_completion(\n",
+"    response = client.chat.completions.create(\n",
 "        messages=conversation_history,\n",
-"        model_id=model_id,\n",
+"        model=model_id,\n",
 "    )\n",
-"    cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+"    cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 "    assistant_message = {\n",
 "        \"role\": \"assistant\", # was user\n",
-"        \"content\": response.completion_message.content,\n",
-"        \"stop_reason\": response.completion_message.stop_reason,\n",
+"        \"content\": response.choices[0].message.content,\n",
+"        \"stop_reason\": response.choices[0].finish_reason,\n",
 "    }\n",
 "    conversation_history.append(assistant_message)\n",
 "\n",

View file

@@ -3615,7 +3615,7 @@
 "from rich.pretty import pprint\n",
 "\n",
 "response = client.models.register(\n",
-"    model_id=\"meta-llama/Llama-3.2-3B-Instruct\",\n",
+"    model=\"meta-llama/Llama-3.2-3B-Instruct\",\n",
 "    provider_id=\"ollama\",\n",
 "    provider_model_id=\"llama3.2:3b\",\n",
 "    # base model id\n",
@@ -5762,7 +5762,7 @@
 "source": [
 "response = client.models.register(\n",
 "    # the model id here needs to be the finetuned checkpoint identifier\n",
-"    model_id=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
+"    model=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
 "    provider_id=\"ollama\",\n",
 "    provider_model_id=\"llama_3_2_finetuned:latest\",\n",
 "    # base model id\n",
@@ -5816,14 +5816,14 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
-"    model_id=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
+"response = client.chat.completions.create(\n",
+"    model=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
 "    messages=[\n",
 "        {\"role\": \"user\", \"content\": \"What is the primary purpose of a W-2 form in relation to income tax?\"}\n",
 "    ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {

View file

@@ -1003,7 +1003,7 @@
 "source": [
 "# register 405B as LLM Judge model\n",
 "client.models.register(\n",
-"    model_id=\"meta-llama/Llama-3.1-405B-Instruct\",\n",
+"    model=\"meta-llama/Llama-3.1-405B-Instruct\",\n",
 "    provider_model_id=\"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo\",\n",
 "    provider_id=\"together\",\n",
 ")\n",

View file

@@ -419,21 +419,15 @@
 "outputs": [],
 "source": [
 "# Test inference\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[\n",
 "        {\"role\": \"user\", \"content\": sample_prompt}\n",
 "    ],\n",
-"    model_id=BASE_MODEL,\n",
-"    sampling_params={\n",
-"        \"max_tokens\": 20,\n",
-"        \"strategy\": {\n",
-"            \"type\": \"top_p\",\n",
-"            \"temperature\": 0.7,\n",
-"            \"top_p\": 0.9\n",
-"        }\n",
-"    }\n",
+"    model=BASE_MODEL,\n",
+"    max_tokens=20,\n",
+"    temperature=0.7,\n",
 ")\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {
@@ -945,20 +939,14 @@
 "outputs": [],
 "source": [
 "# Test inference\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=sample_messages,\n",
-"    model_id=BASE_MODEL,\n",
-"    sampling_params={\n",
-"        \"max_tokens\": 20,\n",
-"        \"strategy\": {\n",
-"            \"type\": \"top_p\",\n",
-"            \"temperature\": 0.7,\n",
-"            \"top_p\": 0.9\n",
-"        }\n",
-"    }\n",
+"    model=BASE_MODEL,\n",
+"    max_tokens=20,\n",
+"    temperature=0.7,\n",
 ")\n",
-"assert response.completion_message.content is not None\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"assert response.choices[0].message.content is not None\n",
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {
@@ -1438,15 +1426,13 @@
 "outputs": [],
 "source": [
 "# Check inference without guardrails\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[message],\n",
-"    model_id=BASE_MODEL,\n",
-"    sampling_params={\n",
-"        \"max_tokens\": 150,\n",
-"    }\n",
+"    model=BASE_MODEL,\n",
+"    max_tokens=150,\n",
 ")\n",
-"assert response.completion_message.content is not None\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"assert response.choices[0].message.content is not None\n",
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {

View file

@@ -687,23 +687,17 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"completion = client.inference.chat_completion(\n",
-"    model_id=CUSTOMIZED_MODEL,\n",
+"completion = client.chat.completions.create(\n",
+"    model=CUSTOMIZED_MODEL,\n",
 "    messages=test_sample[\"messages\"],\n",
 "    tools=test_sample[\"tools\"],\n",
 "    tool_choice=\"auto\",\n",
 "    stream=False,\n",
-"    sampling_params={\n",
-"        \"max_tokens\": 512,\n",
-"        \"strategy\": {\n",
-"            \"type\": \"top_p\",\n",
-"            \"temperature\": 0.1,\n",
-"            \"top_p\": 0.7,\n",
-"        }\n",
-"    },\n",
+"    max_tokens=512,\n",
+"    temperature=0.1,\n",
 ")\n",
 "\n",
-"completion.completion_message.tool_calls"
+"completion.choices[0].message.tool_calls"
 ]
 },
 {

View file

@@ -423,42 +423,30 @@
 "            violation = self.check_guardrails(user_message.get(\"content\"))\n",
 "            \n",
 "            if violation is None:\n",
-"                completion = client.inference.chat_completion(\n",
-"                    model_id=self.customized_model,\n",
+"                completion = client.chat.completions.create(\n",
+"                    model=self.customized_model,\n",
 "                    messages=[user_message],\n",
 "                    tools=tools,\n",
 "                    tool_choice=\"auto\",\n",
 "                    stream=False,\n",
-"                    sampling_params={\n",
-"                        \"max_tokens\": 1024,\n",
-"                        \"strategy\": {\n",
-"                            \"type\": \"top_p\",\n",
-"                            \"top_p\": 0.7,\n",
-"                            \"temperature\": 0.2\n",
-"                        }\n",
-"                    }\n",
+"                    max_tokens=1024,\n",
+"                    temperature=0.2,\n",
 "                )\n",
-"                return completion.completion_message\n",
+"                return completion.choices[0].message.content\n",
 "            else:\n",
 "                return f\"Not a safe input, the guardrails has resulted in a violation: {violation}. Tool-calling shall not happen\"\n",
 "            \n",
 "        elif self.guardrails == \"OFF\":\n",
-"            completion = client.inference.chat_completion(\n",
-"                model_id=self.customized_model,\n",
+"            completion = client.chat.completions.create(\n",
+"                model=self.customized_model,\n",
 "                messages=[user_message],\n",
 "                tools=tools,\n",
 "                tool_choice=\"auto\",\n",
 "                stream=False,\n",
-"                sampling_params={\n",
-"                    \"max_tokens\": 1024,\n",
-"                    \"strategy\": {\n",
-"                        \"type\": \"top_p\",\n",
-"                        \"top_p\": 0.7,\n",
-"                        \"temperature\": 0.2\n",
-"                    }\n",
-"                }\n",
+"                max_tokens=1024,\n",
+"                temperature=0.2,\n",
 "            )\n",
-"            return completion.completion_message"
+"            return completion.choices[0].message.content"
 ]
 },
 {

View file

@@ -60,7 +60,7 @@ client = LlamaStackClient(
     base_url="http://localhost:8321"
 )
-response = client.inference.chat_completion(
+response = client.chat.completions.create(
     model="Llama3.2-3B-Instruct",
     messages=[{
         "role": "user",

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -102,15 +102,15 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[\n",
 "        {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
 "        {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
 "    ],\n",
-"    model_id=MODEL_NAME,\n",
+"    model=MODEL_NAME,\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -141,14 +141,14 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[\n",
 "        {\"role\": \"system\", \"content\": \"You are shakespeare.\"},\n",
 "        {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
 "    ],\n",
-"    model_id=MODEL_NAME, # Changed from model to model_id\n",
+"    model=MODEL_NAME,\n",
 ")\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -218,11 +218,11 @@
 "            break\n",
 "\n",
 "        message = {\"role\": \"user\", \"content\": user_input}\n",
-"        response = client.inference.chat_completion(\n",
+"        response = client.chat.completions.create(\n",
 "            messages=[message],\n",
-"            model_id=MODEL_NAME\n",
+"            model=MODEL_NAME\n",
 "        )\n",
-"        cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+"        cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
 "\n",
 "# Run the chat loop in a Jupyter Notebook cell using await\n",
 "await chat_loop()\n",
@@ -288,16 +288,16 @@
 "        user_message = {\"role\": \"user\", \"content\": user_input}\n",
 "        conversation_history.append(user_message)\n",
 "\n",
-"        response = client.inference.chat_completion(\n",
+"        response = client.chat.completions.create(\n",
 "            messages=conversation_history,\n",
-"            model_id=MODEL_NAME,\n",
+"            model=MODEL_NAME,\n",
 "        )\n",
-"        cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+"        cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
 "\n",
 "        # Append the assistant message with all required fields\n",
 "        assistant_message = {\n",
 "            \"role\": \"user\",\n",
-"            \"content\": response.completion_message.content,\n",
+"            \"content\": response.choices[0].message.content,\n",
 "            # Add any additional required fields here if necessary\n",
 "        }\n",
 "        conversation_history.append(assistant_message)\n",
@@ -349,14 +349,14 @@
 "    }\n",
 "    cprint(f'User> {message[\"content\"]}', 'green')\n",
 "\n",
-"    response = client.inference.chat_completion(\n",
+"    response = client.chat.completions.create(\n",
 "        messages=[message],\n",
-"        model_id=MODEL_NAME,\n",
+"        model=MODEL_NAME,\n",
 "        stream=stream,\n",
 "    )\n",
 "\n",
 "    if not stream:\n",
-"        cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+"        cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
 "    else:\n",
 "        for log in EventLogger().log(response):\n",
 "            log.print()\n",

View file

@@ -134,15 +134,15 @@
 "    }\n",
 "    cprint(f'User> {message[\"content\"]}', 'green')\n",
 "\n",
-"    response = await client.inference.chat_completion(\n",
+"    response = await client.chat.completions.create(\n",
 "        messages=[message],\n",
-"        model_id='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
+"        model='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
 "        stream=stream,\n",
 "    )\n",
 "\n",
 "    cprint(f'Assistant> ', color='cyan', end='')\n",
 "    if not stream:\n",
-"        cprint(response.completion_message.content, color='yellow')\n",
+"        cprint(response.choices[0].message.content, color='yellow')\n",
 "    else:\n",
 "        async for chunk in response:\n",
 "            cprint(chunk.event.delta.text, color='yellow', end='')\n",

View file

@@ -152,8 +152,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"response = client.inference.chat_completion(\n",
-"    messages=few_shot_examples, model_id=MODEL_NAME\n",
+"response = client.chat.completions.create(\n",
+"    messages=few_shot_examples, model=MODEL_NAME\n",
 ")"
 ]
 },
@@ -164,7 +164,7 @@
 "source": [
 "#### 4. Display the Models Response\n",
 "\n",
-"The `completion_message` contains the assistants generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
+"The `choices[0].message.content` contains the assistants generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
 ]
 },
 {
@@ -184,7 +184,7 @@
 "source": [
 "from termcolor import cprint\n",
 "\n",
-"cprint(f'> Response: {response.completion_message.content}', 'cyan')"
+"cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
 ]
 },
 {
@@ -219,7 +219,7 @@
 "\n",
 "client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')\n",
 "\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 "    messages=[\n",
 "        {\"role\": \"user\", \"content\": 'Have shorter, spear-shaped ears.'},\n",
 "        {\n",
@@ -253,10 +253,10 @@
 "            \"content\": 'Generally taller and more robust, commonly seen as guard animals.'\n",
 "        }\n",
 "],\n",
-"    model_id=MODEL_NAME,\n",
+"    model=MODEL_NAME,\n",
 ")\n",
 "\n",
-"cprint(f'> Response: {response.completion_message.content}', 'cyan')"
+"cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
 ]
 },
 {

View file

@@ -102,15 +102,15 @@
 "    }\n",
 "\n",
 "    cprint(\"User> Sending image for analysis...\", \"green\")\n",
-"    response = client.inference.chat_completion(\n",
+"    response = client.chat.completions.create(\n",
 "        messages=[message],\n",
-"        model_id=MODEL_NAME,\n",
+"        model=MODEL_NAME,\n",
 "        stream=stream,\n",
 "    )\n",
 "\n",
 "    cprint(f'Assistant> ', color='cyan', end='')\n",
 "    if not stream:\n",
-"        cprint(response.completion_message.content, color='yellow')\n",
+"        cprint(response.choices[0].message.content, color='yellow')\n",
 "    else:\n",
 "        for chunk in response:\n",
 "            cprint(chunk.event.delta.text, color='yellow', end='')\n",

View file

@@ -131,14 +131,37 @@ After setting up the server, open a new terminal window and configure the llama-
 ```
 **Expected Output:**
 ```bash
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content='Here is a 2-sentence poem about the moon:\n\nSilver crescent shining bright in the night,\nA beacon of wonder, full of gentle light.',
-        role='assistant',
-        stop_reason='end_of_turn',
-        tool_calls=[]
-    ),
-    logprobs=None
+OpenAIChatCompletion(
+    id='chatcmpl-950',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content='...The moon casts silver threads through the velvet night, a silent bard of shadows, ancient and bright.',
+                name=None,
+                tool_calls=None,
+                refusal=None,
+                annotations=None,
+                audio=None,
+                function_call=None
+            ),
+            logprobs=None
+        )
+    ],
+    created=1759240813,
+    model='meta-llama/Llama-3.2-3B-Instruct',
+    object='chat.completion',
+    service_tier=None,
+    system_fingerprint='fp_ollama',
+    usage={
+        'completion_tokens': 479,
+        'prompt_tokens': 19,
+        'total_tokens': 498,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    },
 )
 ```
@@ -147,21 +170,16 @@ After setting up the server, open a new terminal window and configure the llama-
 After setting up the server, open a new terminal window and verify it's working by sending a `POST` request using `curl`:
 ```bash
-curl http://localhost:$LLAMA_STACK_PORT/alpha/inference/chat-completion
+curl http://localhost:$LLAMA_STACK_PORT/v1/chat/completions
 -H "Content-Type: application/json"
 -d @- <<EOF
 {
-    "model_id": "$INFERENCE_MODEL",
+    "model": "$INFERENCE_MODEL",
     "messages": [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Write me a 2-sentence poem about the moon"}
     ],
-    "sampling_params": {
-        "strategy": {
-            "type": "top_p",
-            "temperatrue": 0.7,
-            "top_p": 0.95,
-        },
+    "temperature": 0.7,
     "seed": 42,
     "max_tokens": 512
 }
@@ -174,13 +192,9 @@ You can check the available models with the command `uv run --with llama-stack-c
 **Expected Output:**
 ```json
 {
-  "completion_message": {
-    "role": "assistant",
-    "content": "The moon glows softly in the midnight sky,\nA beacon of wonder, as it catches the eye.",
-    "stop_reason": "out_of_tokens",
-    "tool_calls": []
-  },
-  "logprobs": null
+  ...
+  "content": "... The moon glows softly in the midnight sky,\nA beacon of wonder, as it catches the eye.",
+  ...
 }
 ```
@@ -213,17 +227,17 @@ if INFERENCE_MODEL is None:
 # Initialize the clien
 client = LlamaStackClient(base_url="http://localhost:8321")
 # Create a chat completion reques
-response = client.inference.chat_completion(
+response = client.chat.completions.create(
     messages=[
         {"role": "system", "content": "You are a friendly assistant."},
         {"role": "user", "content": "Write a two-sentence poem about llama."},
     ],
-    model_id=INFERENCE_MODEL,
+    model=INFERENCE_MODEL,
 )
 # Print the response
-print(response.completion_message.content)
+print(response.choices[0].message.content)
 ```
 ### 3. Run the Python Script

View file

@@ -1030,7 +1030,6 @@ class InferenceProvider(Protocol):
         """
         ...
-    @webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
     async def chat_completion(
         self,
         model_id: str,

View file

@@ -44,8 +44,8 @@ client.initialize()
 The following example shows how to create a chat completion for an NVIDIA NIM.
 ```python
-response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "system",
@@ -57,11 +57,9 @@ response = client.inference.chat_completion(
         },
     ],
     stream=False,
-    sampling_params={
-        "max_tokens": 50,
-    },
+    max_tokens=50,
 )
-print(f"Response: {response.completion_message.content}")
+print(f"Response: {response.choices[0].message.content}")
 ```
 ### Tool Calling Example ###
@@ -89,15 +87,15 @@ tool_definition = ToolDefinition(
     },
 )
-tool_response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+tool_response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )
-print(f"Tool Response: {tool_response.completion_message.content}")
-if tool_response.completion_message.tool_calls:
-    for tool_call in tool_response.completion_message.tool_calls:
+print(f"Tool Response: {tool_response.choices[0].message.content}")
+if tool_response.choices[0].message.tool_calls:
+    for tool_call in tool_response.choices[0].message.tool_calls:
         print(f"Tool Called: {tool_call.tool_name}")
         print(f"Arguments: {tool_call.arguments}")
 ```
@@ -123,8 +121,8 @@ response_format = JsonSchemaResponseFormat(
     type=ResponseFormatType.json_schema, json_schema=person_schema
 )
-structured_response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+structured_response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "user",
@@ -134,7 +132,7 @@ structured_response = client.inference.chat_completion(
     response_format=response_format,
 )
-print(f"Structured Response: {structured_response.completion_message.content}")
+print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```
 ### Create Embeddings
@@ -167,8 +165,8 @@ def load_image_as_base64(image_path):
 image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)
-vlm_response = client.inference.chat_completion(
-    model_id="nvidia/vila",
+vlm_response = client.chat.completions.create(
+    model="nvidia/vila",
     messages=[
         {
             "role": "user",
@@ -188,5 +186,5 @@ vlm_response = client.inference.chat_completion(
     ],
 )
-print(f"VLM Response: {vlm_response.completion_message.content}")
+print(f"VLM Response: {vlm_response.choices[0].message.content}")
 ```

View file

@@ -179,15 +179,15 @@ Note that when re-recording tests, you must use a Stack pointing to a server (i.
 ### Basic Test Pattern
 ```python
 def test_basic_chat_completion(llama_stack_client, text_model_id):
-    response = llama_stack_client.inference.chat_completion(
-        model_id=text_model_id,
+    response = llama_stack_client.chat.completions.create(
+        model=text_model_id,
         messages=[{"role": "user", "content": "Hello"}],
     )
     # Test structure, not AI output quality
-    assert response.completion_message is not None
-    assert isinstance(response.completion_message.content, str)
-    assert len(response.completion_message.content) > 0
+    assert response.choices[0].message is not None
+    assert isinstance(response.choices[0].message.content, str)
+    assert len(response.choices[0].message.content) > 0
 ```
 ### Provider-Specific Tests