diff --git a/README.md b/README.md
index d6c5b4138..e9003cdb1 100644
--- a/README.md
+++ b/README.md
@@ -43,10 +43,21 @@ inference chat-completion \
--model-id meta-llama/$MODEL \
--message "write a haiku for meta's llama 4 models"
-ChatCompletionResponse(
- completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
- logprobs=None,
- metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
+OpenAIChatCompletion(
+ ...
+ choices=[
+ OpenAIChatCompletionChoice(
+ finish_reason='stop',
+ index=0,
+ message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+ role='assistant',
+ content='...**Silent minds awaken,** \n**Whispers of billions of words,** \n**Reasoning breaks the night.** \n\n— \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
+ ...
+ ),
+ ...
+ )
+ ],
+ ...
)
```
### Python SDK
@@ -59,14 +70,14 @@ model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
prompt = "Write a haiku about coding"
print(f"User> {prompt}")
-response = client.inference.chat_completion(
- model_id=model_id,
+response = client.chat.completions.create(
+ model=model_id,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
],
)
-print(f"Assistant> {response.completion_message.content}")
+print(f"Assistant> {response.choices[0].message.content}")
```
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
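The field mapping implied by the change above, sketched end to end. Every attribute path here is taken from the hunks in this patch; the `finish_reason` value shown is one example, not the full vocabulary:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

response = client.chat.completions.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about coding"},
    ],
)

# Legacy accessor                          -> OpenAI-compatible accessor
# response.completion_message.content      -> response.choices[0].message.content
# response.completion_message.stop_reason  -> response.choices[0].finish_reason
# response.completion_message.tool_calls   -> response.choices[0].message.tool_calls
print(response.choices[0].message.content)
print(response.choices[0].finish_reason)  # e.g. 'stop'
```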
diff --git a/docs/docs/building_applications/playground.mdx b/docs/docs/building_applications/playground.mdx
index b2aa1b4a5..824a2c32b 100644
--- a/docs/docs/building_applications/playground.mdx
+++ b/docs/docs/building_applications/playground.mdx
@@ -44,7 +44,7 @@ The playground provides interactive pages for users to explore Llama Stack API c
**Simple Chat Interface**
- Chat directly with Llama models through an intuitive interface
-- Uses the `/inference/chat-completion` streaming API under the hood
+- Uses the `/chat/completions` streaming API under the hood
- Real-time message streaming for responsive interactions
- Perfect for testing model capabilities and prompt engineering
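Because the playground now speaks the OpenAI-compatible `/chat/completions` route, the same endpoint should also be reachable with the stock `openai` client. A minimal sketch, assuming the server exposes the route under a `/v1` prefix at the default port (verify against your deployment):

```python
from openai import OpenAI

# Assumption: Llama Stack serves an OpenAI-compatible API at this base URL;
# the api_key placeholder only satisfies the client constructor.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```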
diff --git a/docs/docs/building_applications/telemetry.mdx b/docs/docs/building_applications/telemetry.mdx
index 6a255e702..655a2043b 100644
--- a/docs/docs/building_applications/telemetry.mdx
+++ b/docs/docs/building_applications/telemetry.mdx
@@ -313,7 +313,7 @@ client = LlamaStackClient(
)
# All API calls will be automatically traced
-response = client.inference.chat_completion(
+response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[{"role": "user", "content": "Hello!"}]
)
@@ -327,7 +327,7 @@ with tracer.start_as_current_span("custom_operation") as span:
span.set_attribute("user_id", "user123")
span.set_attribute("operation_type", "chat_completion")
- response = client.inference.chat_completion(
+ response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[{"role": "user", "content": "Hello!"}]
)
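Token-count metrics, which the legacy response carried in a `metrics` list, are expected in the OpenAI-compatible shape under `response.usage`. A sketch that continues the custom span example above and records them as span attributes; the `usage` field and the `llm.*` attribute names are assumptions, not part of this patch:

```python
    # Continues inside the custom span above. Assumes the response carries
    # an OpenAI-style `usage` object; skip silently if it does not.
    usage = getattr(response, "usage", None)
    if usage is not None:
        span.set_attribute("llm.prompt_tokens", usage.prompt_tokens)
        span.set_attribute("llm.completion_tokens", usage.completion_tokens)
        span.set_attribute("llm.total_tokens", usage.total_tokens)
```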
diff --git a/docs/docs/references/python_sdk_reference/index.md b/docs/docs/references/python_sdk_reference/index.md
index bce87e14a..686567458 100644
--- a/docs/docs/references/python_sdk_reference/index.md
+++ b/docs/docs/references/python_sdk_reference/index.md
@@ -216,7 +216,6 @@ from llama_stack_client.types import (
Methods:
-- client.inference.chat_completion(\*\*params) -> InferenceChatCompletionResponse
- client.inference.embeddings(\*\*params) -> EmbeddingsResponse
## VectorIo
diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb
index 56aef2b7d..d7d544ad5 100644
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@@ -543,15 +543,15 @@
"source": [
"model_id = \"meta-llama/Llama-3.3-70B-Instruct\"\n",
"\n",
- "response = client.inference.chat_completion(\n",
- " model_id=model_id,\n",
+ "response = client.chat.completions.create(\n",
+ " model=model_id,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
" ],\n",
")\n",
"\n",
- "print(response.completion_message.content)\n"
+ "print(response.choices[0].message.content)\n"
]
},
{
@@ -625,16 +625,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" )\n",
- " cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+ " cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
"\n",
" assistant_message = {\n",
" \"role\": \"assistant\", # was user\n",
- " \"content\": response.completion_message.content,\n",
- " \"stop_reason\": response.completion_message.stop_reason,\n",
+ " \"content\": response.choices[0].message.content,\n",
+ " \"stop_reason\": response.choices[0].finish_reason,\n",
" }\n",
" conversation_history.append(assistant_message)\n",
"\n",
@@ -691,16 +691,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" )\n",
- " cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+ " cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
"\n",
" assistant_message = {\n",
" \"role\": \"assistant\", # was user\n",
- " \"content\": response.completion_message.content,\n",
- " \"stop_reason\": response.completion_message.stop_reason,\n",
+ " \"content\": response.choices[0].message.content,\n",
+ " \"stop_reason\": response.choices[0].finish_reason,\n",
" }\n",
" conversation_history.append(assistant_message)\n",
"\n",
@@ -763,9 +763,9 @@
"message = {\"role\": \"user\", \"content\": \"Write me a sonnet about llama\"}\n",
"print(f'User> {message[\"content\"]}')\n",
"\n",
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[message],\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" stream=True, # <-----------\n",
")\n",
"\n",
@@ -2917,7 +2917,7 @@
}
],
"source": [
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
@@ -2937,11 +2937,11 @@
" ]\n",
" }\n",
" ],\n",
- " model_id=vision_model_id,\n",
+ " model=vision_model_id,\n",
" stream=False,\n",
")\n",
"\n",
- "print(response.completion_message.content)"
+ "print(response.choices[0].message.content)"
]
},
{
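With `stream=True`, the response is an iterator of chunks rather than a single object. A sketch of the consumption loop; the `delta` payload is the standard OpenAI streaming shape, assumed here rather than shown in the hunk:

```python
# Sketch: print tokens as they arrive. Chunks may arrive with empty
# choices (e.g. a trailing usage-only chunk), hence the guard.
for chunk in response:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```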
diff --git a/docs/getting_started_llama4.ipynb b/docs/getting_started_llama4.ipynb
index 648f4bbef..cd5f83517 100644
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
@@ -577,15 +577,15 @@
}
],
"source": [
- "response = client.inference.chat_completion(\n",
- " model_id=model_id,\n",
+ "response = client.chat.completions.create(\n",
+ " model=model_id,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
" ],\n",
")\n",
"\n",
- "print(response.completion_message.content)\n"
+ "print(response.choices[0].message.content)\n"
]
},
{
@@ -673,7 +673,7 @@
}
],
"source": [
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
@@ -693,11 +693,11 @@
" ]\n",
" }\n",
" ],\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" stream=False,\n",
")\n",
"\n",
- "print(response.completion_message.content)"
+ "print(response.choices[0].message.content)"
]
},
{
@@ -767,16 +767,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" )\n",
- " cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+ " cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
"\n",
" assistant_message = {\n",
" \"role\": \"assistant\", # was user\n",
- " \"content\": response.completion_message.content,\n",
- " \"stop_reason\": response.completion_message.stop_reason,\n",
+ " \"content\": response.choices[0].message.content,\n",
+ " \"stop_reason\": response.choices[0].finish_reason,\n",
" }\n",
" conversation_history.append(assistant_message)\n",
"\n",
@@ -831,16 +831,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" )\n",
- " cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+ " cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
"\n",
" assistant_message = {\n",
" \"role\": \"assistant\", # was user\n",
- " \"content\": response.completion_message.content,\n",
- " \"stop_reason\": response.completion_message.stop_reason,\n",
+ " \"content\": response.choices[0].message.content,\n",
+ " \"stop_reason\": response.choices[0].finish_reason,\n",
" }\n",
" conversation_history.append(assistant_message)\n",
"\n",
diff --git a/docs/getting_started_llama_api.ipynb b/docs/getting_started_llama_api.ipynb
index f6a170980..f65566205 100644
--- a/docs/getting_started_llama_api.ipynb
+++ b/docs/getting_started_llama_api.ipynb
@@ -608,15 +608,15 @@
"# TODO: update this with a vision model\n",
"model_id = \"meta-llama/Llama-4-Maverick-17B-128E-Instruct\"\n",
"\n",
- "response = client.inference.chat_completion(\n",
- " model_id=model_id,\n",
+ "response = client.chat.completions.create(\n",
+ " model=model_id,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
" ],\n",
")\n",
"\n",
- "print(response.completion_message.content)\n"
+ "print(response.choices[0].message.content)\n"
]
},
{
@@ -704,7 +704,7 @@
}
],
"source": [
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
@@ -724,11 +724,11 @@
" ]\n",
" }\n",
" ],\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" stream=False,\n",
")\n",
"\n",
- "print(response.completion_message.content)"
+ "print(response.choices[0].message.content)"
]
},
{
@@ -798,16 +798,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" )\n",
- " cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+ " cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
"\n",
" assistant_message = {\n",
" \"role\": \"assistant\", # was user\n",
- " \"content\": response.completion_message.content,\n",
- " \"stop_reason\": response.completion_message.stop_reason,\n",
+ " \"content\": response.choices[0].message.content,\n",
+ " \"stop_reason\": response.choices[0].finish_reason,\n",
" }\n",
" conversation_history.append(assistant_message)\n",
"\n",
@@ -862,16 +862,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
- " model_id=model_id,\n",
+ " model=model_id,\n",
" )\n",
- " cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+ " cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
"\n",
" assistant_message = {\n",
" \"role\": \"assistant\", # was user\n",
- " \"content\": response.completion_message.content,\n",
- " \"stop_reason\": response.completion_message.stop_reason,\n",
+ " \"content\": response.choices[0].message.content,\n",
+ " \"stop_reason\": response.choices[0].finish_reason,\n",
" }\n",
" conversation_history.append(assistant_message)\n",
"\n",
diff --git a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
index b5fe0d8d9..96a069f1b 100644
--- a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
+++ b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
@@ -3615,7 +3615,7 @@
"from rich.pretty import pprint\n",
"\n",
"response = client.models.register(\n",
- " model_id=\"meta-llama/Llama-3.2-3B-Instruct\",\n",
+ " model=\"meta-llama/Llama-3.2-3B-Instruct\",\n",
" provider_id=\"ollama\",\n",
" provider_model_id=\"llama3.2:3b\",\n",
" # base model id\n",
@@ -5762,7 +5762,7 @@
"source": [
"response = client.models.register(\n",
" # the model id here needs to be the finetuned checkpoint identifier\n",
- " model_id=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
+ " model=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
" provider_id=\"ollama\",\n",
" provider_model_id=\"llama_3_2_finetuned:latest\",\n",
" # base model id\n",
@@ -5816,14 +5816,14 @@
}
],
"source": [
- "response = client.inference.chat_completion(\n",
- " model_id=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": \"What is the primary purpose of a W-2 form in relation to income tax?\"}\n",
" ],\n",
")\n",
"\n",
- "print(response.completion_message.content)"
+ "print(response.choices[0].message.content)"
]
},
{
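Before running inference against the fine-tuned checkpoint, it can help to confirm the registration took effect. A sketch, assuming the SDK's `client.models.list()` method and an `identifier` attribute on the returned models:

```python
# Sanity check (sketch): the fine-tuned model should now be listed.
registered = [m.identifier for m in client.models.list()]
assert "meta-llama/Llama-3.2-3B-Instruct-sft-0" in registered, registered
```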
diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 2acb79e5f..228f426d5 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -1003,7 +1003,7 @@
"source": [
"# register 405B as LLM Judge model\n",
"client.models.register(\n",
- " model_id=\"meta-llama/Llama-3.1-405B-Instruct\",\n",
+ " model=\"meta-llama/Llama-3.1-405B-Instruct\",\n",
" provider_model_id=\"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo\",\n",
" provider_id=\"together\",\n",
")\n",
diff --git a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
index 601276526..674b961c7 100644
--- a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
@@ -419,21 +419,15 @@
"outputs": [],
"source": [
"# Test inference\n",
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": sample_prompt}\n",
" ],\n",
- " model_id=BASE_MODEL,\n",
- " sampling_params={\n",
- " \"max_tokens\": 20,\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"temperature\": 0.7,\n",
- " \"top_p\": 0.9\n",
- " }\n",
- " }\n",
+ " model=BASE_MODEL,\n",
+ " max_tokens=20,\n",
+ " temperature=0.7,\n",
")\n",
- "print(f\"Inference response: {response.completion_message.content}\")"
+ "print(f\"Inference response: {response.choices[0].message.content}\")"
]
},
{
@@ -945,20 +939,14 @@
"outputs": [],
"source": [
"# Test inference\n",
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=sample_messages,\n",
- " model_id=BASE_MODEL,\n",
- " sampling_params={\n",
- " \"max_tokens\": 20,\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"temperature\": 0.7,\n",
- " \"top_p\": 0.9\n",
- " }\n",
- " }\n",
+ " model=BASE_MODEL,\n",
+ " max_tokens=20,\n",
+ " temperature=0.7,\n",
")\n",
- "assert response.completion_message.content is not None\n",
- "print(f\"Inference response: {response.completion_message.content}\")"
+ "assert response.choices[0].message.content is not None\n",
+ "print(f\"Inference response: {response.choices[0].message.content}\")"
]
},
{
@@ -1438,15 +1426,13 @@
"outputs": [],
"source": [
"# Check inference without guardrails\n",
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[message],\n",
- " model_id=BASE_MODEL,\n",
- " sampling_params={\n",
- " \"max_tokens\": 150,\n",
- " }\n",
+ " model=BASE_MODEL,\n",
+ " max_tokens=150,\n",
")\n",
- "assert response.completion_message.content is not None\n",
- "print(f\"Inference response: {response.completion_message.content}\")"
+ "assert response.choices[0].message.content is not None\n",
+ "print(f\"Inference response: {response.choices[0].message.content}\")"
]
},
{
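The hunks above flatten the legacy `sampling_params` dict into flat OpenAI-style keyword arguments. Note that `top_p: 0.9` from the old strategy block has no counterpart in the new calls; if you want to preserve it, an OpenAI-style `top_p` kwarg is presumably the place (an assumption, since this patch drops it silently):

```python
# Legacy sampling_params               -> flat kwargs (sketch)
# {"max_tokens": 20,                   -> max_tokens=20
#  "strategy": {"type": "top_p",
#               "temperature": 0.7,    -> temperature=0.7
#               "top_p": 0.9}}         -> top_p=0.9 (assumed supported; dropped above)
response = client.chat.completions.create(
    messages=[{"role": "user", "content": sample_prompt}],  # from the notebook
    model=BASE_MODEL,
    max_tokens=20,
    temperature=0.7,
    top_p=0.9,
)
```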
diff --git a/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb b/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
index 0e69cafd5..7ab94a281 100644
--- a/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
@@ -687,23 +687,17 @@
"metadata": {},
"outputs": [],
"source": [
- "completion = client.inference.chat_completion(\n",
- " model_id=CUSTOMIZED_MODEL,\n",
+ "completion = client.chat.completions.create(\n",
+ " model=CUSTOMIZED_MODEL,\n",
" messages=test_sample[\"messages\"],\n",
" tools=test_sample[\"tools\"],\n",
" tool_choice=\"auto\",\n",
" stream=False,\n",
- " sampling_params={\n",
- " \"max_tokens\": 512,\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"temperature\": 0.1,\n",
- " \"top_p\": 0.7,\n",
- " }\n",
- " },\n",
+ " max_tokens=512,\n",
+ " temperature=0.1,\n",
")\n",
"\n",
- "completion.completion_message.tool_calls"
+ "completion.choices[0].message.tool_calls"
]
},
{
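In the OpenAI-compatible shape, each entry of `message.tool_calls` presumably exposes a `function` object with a `name` and a JSON-encoded `arguments` string, in place of the legacy `tool_name`/`arguments` fields. A sketch for unpacking them under that assumption:

```python
import json

# Sketch: iterate tool calls from the completion above. The
# .function.name / .function.arguments shape is the standard
# OpenAI one, assumed here rather than confirmed by this patch.
for tool_call in completion.choices[0].message.tool_calls or []:
    name = tool_call.function.name
    arguments = json.loads(tool_call.function.arguments)
    print(f"{name}({arguments})")
```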
diff --git a/docs/notebooks/nvidia/tool_calling/4_adding_safety_guardrails.ipynb b/docs/notebooks/nvidia/tool_calling/4_adding_safety_guardrails.ipynb
index 25bcd0b69..1c8538634 100644
--- a/docs/notebooks/nvidia/tool_calling/4_adding_safety_guardrails.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/4_adding_safety_guardrails.ipynb
@@ -423,42 +423,30 @@
" violation = self.check_guardrails(user_message.get(\"content\"))\n",
" \n",
" if violation is None:\n",
- " completion = client.inference.chat_completion(\n",
- " model_id=self.customized_model,\n",
+ " completion = client.chat.completions.create(\n",
+ " model=self.customized_model,\n",
" messages=[user_message],\n",
" tools=tools,\n",
" tool_choice=\"auto\",\n",
" stream=False,\n",
- " sampling_params={\n",
- " \"max_tokens\": 1024,\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"top_p\": 0.7,\n",
- " \"temperature\": 0.2\n",
- " }\n",
- " }\n",
+ " max_tokens=1024,\n",
+ " temperature=0.2,\n",
" )\n",
- " return completion.completion_message\n",
+ " return completion.choices[0].message.content\n",
" else:\n",
" return f\"Not a safe input, the guardrails has resulted in a violation: {violation}. Tool-calling shall not happen\"\n",
" \n",
" elif self.guardrails == \"OFF\":\n",
- " completion = client.inference.chat_completion(\n",
- " model_id=self.customized_model,\n",
+ " completion = client.chat.completions.create(\n",
+ " model=self.customized_model,\n",
" messages=[user_message],\n",
" tools=tools,\n",
" tool_choice=\"auto\",\n",
" stream=False,\n",
- " sampling_params={\n",
- " \"max_tokens\": 1024,\n",
- " \"strategy\": {\n",
- " \"type\": \"top_p\",\n",
- " \"top_p\": 0.7,\n",
- " \"temperature\": 0.2\n",
- " }\n",
- " }\n",
+ " max_tokens=1024,\n",
+ " temperature=0.2,\n",
" )\n",
- " return completion.completion_message"
+ " return completion.choices[0].message.content"
]
},
{
diff --git a/docs/src/pages/index.js b/docs/src/pages/index.js
index c97959d77..b49d75dbc 100644
--- a/docs/src/pages/index.js
+++ b/docs/src/pages/index.js
@@ -60,7 +60,7 @@ client = LlamaStackClient(
base_url="http://localhost:8321"
)
-response = client.inference.chat_completion(
+response = client.chat.completions.create(
model="Llama3.2-3B-Instruct",
messages=[{
"role": "user",
diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index 01b316069..d46e54011 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -161,55 +161,6 @@
}
}
},
- "/v1/inference/chat-completion": {
- "post": {
- "responses": {
- "200": {
- "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ChatCompletionResponse"
- }
- },
- "text/event-stream": {
- "schema": {
- "$ref": "#/components/schemas/ChatCompletionResponseStreamChunk"
- }
- }
- }
- },
- "400": {
- "$ref": "#/components/responses/BadRequest400"
- },
- "429": {
- "$ref": "#/components/responses/TooManyRequests429"
- },
- "500": {
- "$ref": "#/components/responses/InternalServerError500"
- },
- "default": {
- "$ref": "#/components/responses/DefaultError"
- }
- },
- "tags": [
- "Inference"
- ],
- "summary": "Generate a chat completion for the given messages using the specified model.",
- "description": "Generate a chat completion for the given messages using the specified model.",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ChatCompletionRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/agents": {
"get": {
"responses": {
@@ -6126,1052 +6077,6 @@
],
"title": "CancelTrainingJobRequest"
},
- "CompletionMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "assistant",
- "default": "assistant",
- "description": "Must be \"assistant\" to identify this as the model's response"
- },
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The content of the model's response"
- },
- "stop_reason": {
- "type": "string",
- "enum": [
- "end_of_turn",
- "end_of_message",
- "out_of_tokens"
- ],
- "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
- },
- "tool_calls": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolCall"
- },
- "description": "List of tool calls. Each tool call is a ToolCall object."
- }
- },
- "additionalProperties": false,
- "required": [
- "role",
- "content",
- "stop_reason"
- ],
- "title": "CompletionMessage",
- "description": "A message containing the model's (assistant) response in a chat conversation."
- },
- "GrammarResponseFormat": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "enum": [
- "json_schema",
- "grammar"
- ],
- "description": "Must be \"grammar\" to identify this format type",
- "const": "grammar",
- "default": "grammar"
- },
- "bnf": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "description": "The BNF grammar specification the response should conform to"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "bnf"
- ],
- "title": "GrammarResponseFormat",
- "description": "Configuration for grammar-guided response generation."
- },
- "GreedySamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "greedy",
- "default": "greedy",
- "description": "Must be \"greedy\" to identify this sampling strategy"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "GreedySamplingStrategy",
- "description": "Greedy sampling strategy that selects the highest probability token at each step."
- },
- "ImageContentItem": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "image",
- "default": "image",
- "description": "Discriminator type of the content item. Always \"image\""
- },
- "image": {
- "type": "object",
- "properties": {
- "url": {
- "$ref": "#/components/schemas/URL",
- "description": "A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits."
- },
- "data": {
- "type": "string",
- "contentEncoding": "base64",
- "description": "base64 encoded image data as string"
- }
- },
- "additionalProperties": false,
- "description": "Image as a base64 encoded string or an URL"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "image"
- ],
- "title": "ImageContentItem",
- "description": "A image content item"
- },
- "InterleavedContent": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "$ref": "#/components/schemas/InterleavedContentItem"
- },
- {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/InterleavedContentItem"
- }
- }
- ]
- },
- "InterleavedContentItem": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ImageContentItem"
- },
- {
- "$ref": "#/components/schemas/TextContentItem"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "image": "#/components/schemas/ImageContentItem",
- "text": "#/components/schemas/TextContentItem"
- }
- }
- },
- "JsonSchemaResponseFormat": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "enum": [
- "json_schema",
- "grammar"
- ],
- "description": "Must be \"json_schema\" to identify this format type",
- "const": "json_schema",
- "default": "json_schema"
- },
- "json_schema": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model."
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "json_schema"
- ],
- "title": "JsonSchemaResponseFormat",
- "description": "Configuration for JSON schema-guided response generation."
- },
- "Message": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
- },
- {
- "$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
- }
- ],
- "discriminator": {
- "propertyName": "role",
- "mapping": {
- "user": "#/components/schemas/UserMessage",
- "system": "#/components/schemas/SystemMessage",
- "tool": "#/components/schemas/ToolResponseMessage",
- "assistant": "#/components/schemas/CompletionMessage"
- }
- }
- },
- "ResponseFormat": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/JsonSchemaResponseFormat"
- },
- {
- "$ref": "#/components/schemas/GrammarResponseFormat"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
- "grammar": "#/components/schemas/GrammarResponseFormat"
- }
- }
- },
- "SamplingParams": {
- "type": "object",
- "properties": {
- "strategy": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/GreedySamplingStrategy"
- },
- {
- "$ref": "#/components/schemas/TopPSamplingStrategy"
- },
- {
- "$ref": "#/components/schemas/TopKSamplingStrategy"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "greedy": "#/components/schemas/GreedySamplingStrategy",
- "top_p": "#/components/schemas/TopPSamplingStrategy",
- "top_k": "#/components/schemas/TopKSamplingStrategy"
- }
- },
- "description": "The sampling strategy."
- },
- "max_tokens": {
- "type": "integer",
- "default": 0,
- "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
- },
- "repetition_penalty": {
- "type": "number",
- "default": 1.0,
- "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
- },
- "stop": {
- "type": "array",
- "items": {
- "type": "string"
- },
- "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
- }
- },
- "additionalProperties": false,
- "required": [
- "strategy"
- ],
- "title": "SamplingParams",
- "description": "Sampling parameters."
- },
- "SystemMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "system",
- "default": "system",
- "description": "Must be \"system\" to identify this as a system message"
- },
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
- }
- },
- "additionalProperties": false,
- "required": [
- "role",
- "content"
- ],
- "title": "SystemMessage",
- "description": "A system message providing instructions or context to the model."
- },
- "TextContentItem": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "text",
- "default": "text",
- "description": "Discriminator type of the content item. Always \"text\""
- },
- "text": {
- "type": "string",
- "description": "Text content"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "text"
- ],
- "title": "TextContentItem",
- "description": "A text content item"
- },
- "ToolCall": {
- "type": "object",
- "properties": {
- "call_id": {
- "type": "string"
- },
- "tool_name": {
- "oneOf": [
- {
- "type": "string",
- "enum": [
- "brave_search",
- "wolfram_alpha",
- "photogen",
- "code_interpreter"
- ],
- "title": "BuiltinTool"
- },
- {
- "type": "string"
- }
- ]
- },
- "arguments": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "integer"
- },
- {
- "type": "number"
- },
- {
- "type": "boolean"
- },
- {
- "type": "null"
- },
- {
- "type": "array",
- "items": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "integer"
- },
- {
- "type": "number"
- },
- {
- "type": "boolean"
- },
- {
- "type": "null"
- }
- ]
- }
- },
- {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "integer"
- },
- {
- "type": "number"
- },
- {
- "type": "boolean"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- ]
- }
- }
- ]
- },
- "arguments_json": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "call_id",
- "tool_name",
- "arguments"
- ],
- "title": "ToolCall"
- },
- "ToolConfig": {
- "type": "object",
- "properties": {
- "tool_choice": {
- "oneOf": [
- {
- "type": "string",
- "enum": [
- "auto",
- "required",
- "none"
- ],
- "title": "ToolChoice",
- "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
- },
- {
- "type": "string"
- }
- ],
- "default": "auto",
- "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto."
- },
- "tool_prompt_format": {
- "type": "string",
- "enum": [
- "json",
- "function_tag",
- "python_list"
- ],
- "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
- },
- "system_message_behavior": {
- "type": "string",
- "enum": [
- "append",
- "replace"
- ],
- "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
- "default": "append"
- }
- },
- "additionalProperties": false,
- "title": "ToolConfig",
- "description": "Configuration for tool use."
- },
- "ToolDefinition": {
- "type": "object",
- "properties": {
- "tool_name": {
- "oneOf": [
- {
- "type": "string",
- "enum": [
- "brave_search",
- "wolfram_alpha",
- "photogen",
- "code_interpreter"
- ],
- "title": "BuiltinTool"
- },
- {
- "type": "string"
- }
- ]
- },
- "description": {
- "type": "string"
- },
- "parameters": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ToolParamDefinition"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "tool_name"
- ],
- "title": "ToolDefinition"
- },
- "ToolParamDefinition": {
- "type": "object",
- "properties": {
- "param_type": {
- "type": "string"
- },
- "description": {
- "type": "string"
- },
- "required": {
- "type": "boolean",
- "default": true
- },
- "items": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- },
- "title": {
- "type": "string"
- },
- "default": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "additionalProperties": false,
- "required": [
- "param_type"
- ],
- "title": "ToolParamDefinition"
- },
- "ToolResponseMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "tool",
- "default": "tool",
- "description": "Must be \"tool\" to identify this as a tool response"
- },
- "call_id": {
- "type": "string",
- "description": "Unique identifier for the tool call this response is for"
- },
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The response content from the tool"
- }
- },
- "additionalProperties": false,
- "required": [
- "role",
- "call_id",
- "content"
- ],
- "title": "ToolResponseMessage",
- "description": "A message representing the result of a tool invocation."
- },
- "TopKSamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "top_k",
- "default": "top_k",
- "description": "Must be \"top_k\" to identify this sampling strategy"
- },
- "top_k": {
- "type": "integer",
- "description": "Number of top tokens to consider for sampling. Must be at least 1"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "top_k"
- ],
- "title": "TopKSamplingStrategy",
- "description": "Top-k sampling strategy that restricts sampling to the k most likely tokens."
- },
- "TopPSamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "top_p",
- "default": "top_p",
- "description": "Must be \"top_p\" to identify this sampling strategy"
- },
- "temperature": {
- "type": "number",
- "description": "Controls randomness in sampling. Higher values increase randomness"
- },
- "top_p": {
- "type": "number",
- "default": 0.95,
- "description": "Cumulative probability threshold for nucleus sampling. Defaults to 0.95"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ],
- "title": "TopPSamplingStrategy",
- "description": "Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p."
- },
- "URL": {
- "type": "object",
- "properties": {
- "uri": {
- "type": "string",
- "description": "The URL string pointing to the resource"
- }
- },
- "additionalProperties": false,
- "required": [
- "uri"
- ],
- "title": "URL",
- "description": "A URL reference to external content."
- },
- "UserMessage": {
- "type": "object",
- "properties": {
- "role": {
- "type": "string",
- "const": "user",
- "default": "user",
- "description": "Must be \"user\" to identify this as a user message"
- },
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The content of the message, which can include text and other media"
- },
- "context": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future."
- }
- },
- "additionalProperties": false,
- "required": [
- "role",
- "content"
- ],
- "title": "UserMessage",
- "description": "A message from the user in a chat conversation."
- },
- "ChatCompletionRequest": {
- "type": "object",
- "properties": {
- "model_id": {
- "type": "string",
- "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
- },
- "messages": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/Message"
- },
- "description": "List of messages in the conversation."
- },
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams",
- "description": "Parameters to control the sampling strategy."
- },
- "tools": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolDefinition"
- },
- "description": "(Optional) List of tool definitions available to the model."
- },
- "tool_choice": {
- "type": "string",
- "enum": [
- "auto",
- "required",
- "none"
- ],
- "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead."
- },
- "tool_prompt_format": {
- "type": "string",
- "enum": [
- "json",
- "function_tag",
- "python_list"
- ],
- "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls. .. deprecated:: Use tool_config instead."
- },
- "response_format": {
- "$ref": "#/components/schemas/ResponseFormat",
- "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
- },
- "stream": {
- "type": "boolean",
- "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
- },
- "logprobs": {
- "type": "object",
- "properties": {
- "top_k": {
- "type": "integer",
- "default": 0,
- "description": "How many tokens (for each position) to return log probabilities for."
- }
- },
- "additionalProperties": false,
- "description": "(Optional) If specified, log probabilities for each token position will be returned."
- },
- "tool_config": {
- "$ref": "#/components/schemas/ToolConfig",
- "description": "(Optional) Configuration for tool use."
- }
- },
- "additionalProperties": false,
- "required": [
- "model_id",
- "messages"
- ],
- "title": "ChatCompletionRequest"
- },
- "ChatCompletionResponse": {
- "type": "object",
- "properties": {
- "metrics": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/MetricInResponse"
- },
- "description": "(Optional) List of metrics associated with the API response"
- },
- "completion_message": {
- "$ref": "#/components/schemas/CompletionMessage",
- "description": "The complete response message"
- },
- "logprobs": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/TokenLogProbs"
- },
- "description": "Optional log probabilities for generated tokens"
- }
- },
- "additionalProperties": false,
- "required": [
- "completion_message"
- ],
- "title": "ChatCompletionResponse",
- "description": "Response from a chat completion request."
- },
- "MetricInResponse": {
- "type": "object",
- "properties": {
- "metric": {
- "type": "string",
- "description": "The name of the metric"
- },
- "value": {
- "oneOf": [
- {
- "type": "integer"
- },
- {
- "type": "number"
- }
- ],
- "description": "The numeric value of the metric"
- },
- "unit": {
- "type": "string",
- "description": "(Optional) The unit of measurement for the metric value"
- }
- },
- "additionalProperties": false,
- "required": [
- "metric",
- "value"
- ],
- "title": "MetricInResponse",
- "description": "A metric value included in API responses."
- },
- "TokenLogProbs": {
- "type": "object",
- "properties": {
- "logprobs_by_token": {
- "type": "object",
- "additionalProperties": {
- "type": "number"
- },
- "description": "Dictionary mapping tokens to their log probabilities"
- }
- },
- "additionalProperties": false,
- "required": [
- "logprobs_by_token"
- ],
- "title": "TokenLogProbs",
- "description": "Log probabilities for generated tokens."
- },
- "ChatCompletionResponseEvent": {
- "type": "object",
- "properties": {
- "event_type": {
- "type": "string",
- "enum": [
- "start",
- "complete",
- "progress"
- ],
- "description": "Type of the event"
- },
- "delta": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/TextDelta"
- },
- {
- "$ref": "#/components/schemas/ImageDelta"
- },
- {
- "$ref": "#/components/schemas/ToolCallDelta"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "text": "#/components/schemas/TextDelta",
- "image": "#/components/schemas/ImageDelta",
- "tool_call": "#/components/schemas/ToolCallDelta"
- }
- },
- "description": "Content generated since last event. This can be one or more tokens, or a tool call."
- },
- "logprobs": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/TokenLogProbs"
- },
- "description": "Optional log probabilities for generated tokens"
- },
- "stop_reason": {
- "type": "string",
- "enum": [
- "end_of_turn",
- "end_of_message",
- "out_of_tokens"
- ],
- "description": "Optional reason why generation stopped, if complete"
- }
- },
- "additionalProperties": false,
- "required": [
- "event_type",
- "delta"
- ],
- "title": "ChatCompletionResponseEvent",
- "description": "An event during chat completion generation."
- },
- "ChatCompletionResponseStreamChunk": {
- "type": "object",
- "properties": {
- "metrics": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/MetricInResponse"
- },
- "description": "(Optional) List of metrics associated with the API response"
- },
- "event": {
- "$ref": "#/components/schemas/ChatCompletionResponseEvent",
- "description": "The event containing the new content"
- }
- },
- "additionalProperties": false,
- "required": [
- "event"
- ],
- "title": "ChatCompletionResponseStreamChunk",
- "description": "A chunk of a streamed chat completion response."
- },
- "ImageDelta": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "image",
- "default": "image",
- "description": "Discriminator type of the delta. Always \"image\""
- },
- "image": {
- "type": "string",
- "contentEncoding": "base64",
- "description": "The incremental image data as bytes"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "image"
- ],
- "title": "ImageDelta",
- "description": "An image content delta for streaming responses."
- },
- "TextDelta": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "text",
- "default": "text",
- "description": "Discriminator type of the delta. Always \"text\""
- },
- "text": {
- "type": "string",
- "description": "The incremental text content"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "text"
- ],
- "title": "TextDelta",
- "description": "A text content delta for streaming responses."
- },
- "ToolCallDelta": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "tool_call",
- "default": "tool_call",
- "description": "Discriminator type of the delta. Always \"tool_call\""
- },
- "tool_call": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "$ref": "#/components/schemas/ToolCall"
- }
- ],
- "description": "Either an in-progress tool call string or the final parsed tool call"
- },
- "parse_status": {
- "type": "string",
- "enum": [
- "started",
- "in_progress",
- "failed",
- "succeeded"
- ],
- "description": "Current parsing status of the tool call"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "tool_call",
- "parse_status"
- ],
- "title": "ToolCallDelta",
- "description": "A tool call content delta for streaming responses."
- },
"AgentConfig": {
"type": "object",
"properties": {
@@ -7307,6 +6212,231 @@
}
]
},
+ "GrammarResponseFormat": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": [
+ "json_schema",
+ "grammar"
+ ],
+ "description": "Must be \"grammar\" to identify this format type",
+ "const": "grammar",
+ "default": "grammar"
+ },
+ "bnf": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "The BNF grammar specification the response should conform to"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "bnf"
+ ],
+ "title": "GrammarResponseFormat",
+ "description": "Configuration for grammar-guided response generation."
+ },
+ "GreedySamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "greedy",
+ "default": "greedy",
+ "description": "Must be \"greedy\" to identify this sampling strategy"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "GreedySamplingStrategy",
+ "description": "Greedy sampling strategy that selects the highest probability token at each step."
+ },
+ "JsonSchemaResponseFormat": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": [
+ "json_schema",
+ "grammar"
+ ],
+ "description": "Must be \"json_schema\" to identify this format type",
+ "const": "json_schema",
+ "default": "json_schema"
+ },
+ "json_schema": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "json_schema"
+ ],
+ "title": "JsonSchemaResponseFormat",
+ "description": "Configuration for JSON schema-guided response generation."
+ },
+ "ResponseFormat": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/JsonSchemaResponseFormat"
+ },
+ {
+ "$ref": "#/components/schemas/GrammarResponseFormat"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
+ "grammar": "#/components/schemas/GrammarResponseFormat"
+ }
+ }
+ },
+ "SamplingParams": {
+ "type": "object",
+ "properties": {
+ "strategy": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/GreedySamplingStrategy"
+ },
+ {
+ "$ref": "#/components/schemas/TopPSamplingStrategy"
+ },
+ {
+ "$ref": "#/components/schemas/TopKSamplingStrategy"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "greedy": "#/components/schemas/GreedySamplingStrategy",
+ "top_p": "#/components/schemas/TopPSamplingStrategy",
+ "top_k": "#/components/schemas/TopKSamplingStrategy"
+ }
+ },
+ "description": "The sampling strategy."
+ },
+ "max_tokens": {
+ "type": "integer",
+ "default": 0,
+ "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
+ },
+ "repetition_penalty": {
+ "type": "number",
+ "default": 1.0,
+ "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
+ },
+ "stop": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "strategy"
+ ],
+ "title": "SamplingParams",
+ "description": "Sampling parameters."
+ },
+ "ToolConfig": {
+ "type": "object",
+ "properties": {
+ "tool_choice": {
+ "oneOf": [
+ {
+ "type": "string",
+ "enum": [
+ "auto",
+ "required",
+ "none"
+ ],
+ "title": "ToolChoice",
+ "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
+ },
+ {
+ "type": "string"
+ }
+ ],
+ "default": "auto",
+ "description": "(Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto."
+ },
+ "tool_prompt_format": {
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+        "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a function tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
+ },
+ "system_message_behavior": {
+ "type": "string",
+ "enum": [
+ "append",
+ "replace"
+ ],
+ "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
+ "default": "append"
+ }
+ },
+ "additionalProperties": false,
+ "title": "ToolConfig",
+ "description": "Configuration for tool use."
+ },
"ToolDef": {
"type": "object",
"properties": {
@@ -7421,6 +6551,54 @@
"title": "ToolParameter",
"description": "Parameter definition for a tool."
},
+ "TopKSamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "top_k",
+ "default": "top_k",
+ "description": "Must be \"top_k\" to identify this sampling strategy"
+ },
+ "top_k": {
+ "type": "integer",
+ "description": "Number of top tokens to consider for sampling. Must be at least 1"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "top_k"
+ ],
+ "title": "TopKSamplingStrategy",
+ "description": "Top-k sampling strategy that restricts sampling to the k most likely tokens."
+ },
+ "TopPSamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "top_p",
+ "default": "top_p",
+ "description": "Must be \"top_p\" to identify this sampling strategy"
+ },
+ "temperature": {
+ "type": "number",
+ "description": "Controls randomness in sampling. Higher values increase randomness"
+ },
+ "top_p": {
+ "type": "number",
+ "default": 0.95,
+ "description": "Cumulative probability threshold for nucleus sampling. Defaults to 0.95"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "TopPSamplingStrategy",
+ "description": "Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p."
+ },
"CreateAgentRequest": {
"type": "object",
"properties": {
@@ -7479,6 +6657,163 @@
"title": "AgentSessionCreateResponse",
"description": "Response returned when creating a new agent session."
},
+ "ImageContentItem": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "image",
+ "default": "image",
+ "description": "Discriminator type of the content item. Always \"image\""
+ },
+ "image": {
+ "type": "object",
+ "properties": {
+ "url": {
+ "$ref": "#/components/schemas/URL",
+            "description": "A URL of the image or a data URL in the format of data:image/{type};base64,{data}. Note that URLs may have length limits."
+ },
+ "data": {
+ "type": "string",
+ "contentEncoding": "base64",
+ "description": "base64 encoded image data as string"
+ }
+ },
+ "additionalProperties": false,
+          "description": "Image as a base64 encoded string or a URL"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "image"
+ ],
+ "title": "ImageContentItem",
+      "description": "An image content item"
+ },
+ "InterleavedContent": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/InterleavedContentItem"
+ },
+ {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/InterleavedContentItem"
+ }
+ }
+ ]
+ },
+ "InterleavedContentItem": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ImageContentItem"
+ },
+ {
+ "$ref": "#/components/schemas/TextContentItem"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "image": "#/components/schemas/ImageContentItem",
+ "text": "#/components/schemas/TextContentItem"
+ }
+ }
+ },
+ "TextContentItem": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "text",
+ "default": "text",
+ "description": "Discriminator type of the content item. Always \"text\""
+ },
+ "text": {
+ "type": "string",
+ "description": "Text content"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "text"
+ ],
+ "title": "TextContentItem",
+ "description": "A text content item"
+ },
+ "ToolResponseMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "tool",
+ "default": "tool",
+ "description": "Must be \"tool\" to identify this as a tool response"
+ },
+ "call_id": {
+ "type": "string",
+ "description": "Unique identifier for the tool call this response is for"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The response content from the tool"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "call_id",
+ "content"
+ ],
+ "title": "ToolResponseMessage",
+ "description": "A message representing the result of a tool invocation."
+ },
+ "URL": {
+ "type": "object",
+ "properties": {
+ "uri": {
+ "type": "string",
+ "description": "The URL string pointing to the resource"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "uri"
+ ],
+ "title": "URL",
+ "description": "A URL reference to external content."
+ },
+ "UserMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "user",
+ "default": "user",
+ "description": "Must be \"user\" to identify this as a user message"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the message, which can include text and other media"
+ },
+ "context": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "content"
+ ],
+ "title": "UserMessage",
+ "description": "A message from the user in a chat conversation."
+ },
"CreateAgentTurnRequest": {
"type": "object",
"properties": {
@@ -7558,6 +6893,45 @@
],
"title": "CreateAgentTurnRequest"
},
+ "CompletionMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "assistant",
+ "default": "assistant",
+ "description": "Must be \"assistant\" to identify this as the model's response"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the model's response"
+ },
+ "stop_reason": {
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
+ },
+ "tool_calls": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolCall"
+ },
+ "description": "List of tool calls. Each tool call is a ToolCall object."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "content",
+ "stop_reason"
+ ],
+ "title": "CompletionMessage",
+ "description": "A message containing the model's (assistant) response in a chat conversation."
+ },
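The `stop_reason` values above drive the tool-call loop: `end_of_message` signals a pending tool call that the client is expected to answer with a `ToolResponseMessage`. A hedged sketch of that handshake, with illustrative values:

```python
completion = {
    "role": "assistant",
    "content": "",
    "stop_reason": "end_of_message",
    "tool_calls": [
        {"call_id": "call-1", "tool_name": "brave_search",
         "arguments": {"query": "llama habitats"}},
    ],
}

if completion["stop_reason"] == "end_of_message" and completion["tool_calls"]:
    call = completion["tool_calls"][0]
    tool_message = {
        "role": "tool",
        "call_id": call["call_id"],              # ties the reply to the call
        "content": "Llamas live in the Andes.",  # InterleavedContent: a bare string is valid
    }
```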
"InferenceStep": {
"type": "object",
"properties": {
@@ -7755,6 +7129,114 @@
"title": "ShieldCallStep",
"description": "A shield call step in an agent turn."
},
+ "ToolCall": {
+ "type": "object",
+ "properties": {
+ "call_id": {
+ "type": "string"
+ },
+ "tool_name": {
+ "oneOf": [
+ {
+ "type": "string",
+ "enum": [
+ "brave_search",
+ "wolfram_alpha",
+ "photogen",
+ "code_interpreter"
+ ],
+ "title": "BuiltinTool"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ },
+ "arguments": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "integer"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ ]
+ },
+ "arguments_json": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "call_id",
+ "tool_name",
+ "arguments"
+ ],
+ "title": "ToolCall"
+ },
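Per the schema, `arguments` is either a raw string or a map of primitives, arrays, and nested maps, while `arguments_json` carries the same data pre-serialized. A small sketch of handling both forms (the tool name and values are hypothetical):

```python
import json

tool_call = {
    "call_id": "call-42",
    "tool_name": "get_weather",  # free-form names are allowed alongside the builtin enum
    "arguments": {"city": "Lima", "days": 3, "units": "metric"},
    "arguments_json": '{"city": "Lima", "days": 3, "units": "metric"}',
}

args = tool_call["arguments"]
if isinstance(args, str):  # the schema also permits a plain string; assume it is JSON-encoded
    args = json.loads(args)
print(args["city"])  # -> Lima
```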
"ToolExecutionStep": {
"type": "object",
"properties": {
@@ -8360,6 +7842,91 @@
"title": "AgentTurnResponseTurnStartPayload",
"description": "Payload for turn start events in agent turn responses."
},
+ "ImageDelta": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "image",
+ "default": "image",
+ "description": "Discriminator type of the delta. Always \"image\""
+ },
+ "image": {
+ "type": "string",
+ "contentEncoding": "base64",
+ "description": "The incremental image data as bytes"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "image"
+ ],
+ "title": "ImageDelta",
+ "description": "An image content delta for streaming responses."
+ },
+ "TextDelta": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "text",
+ "default": "text",
+ "description": "Discriminator type of the delta. Always \"text\""
+ },
+ "text": {
+ "type": "string",
+ "description": "The incremental text content"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "text"
+ ],
+ "title": "TextDelta",
+ "description": "A text content delta for streaming responses."
+ },
+ "ToolCallDelta": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "tool_call",
+ "default": "tool_call",
+ "description": "Discriminator type of the delta. Always \"tool_call\""
+ },
+ "tool_call": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "$ref": "#/components/schemas/ToolCall"
+ }
+ ],
+ "description": "Either an in-progress tool call string or the final parsed tool call"
+ },
+ "parse_status": {
+ "type": "string",
+ "enum": [
+ "started",
+ "in_progress",
+ "failed",
+ "succeeded"
+ ],
+ "description": "Current parsing status of the tool call"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "tool_call",
+ "parse_status"
+ ],
+ "title": "ToolCallDelta",
+ "description": "A tool call content delta for streaming responses."
+ },
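These delta types are what a streaming client folds into a final message. A minimal accumulator sketch, assuming chunks shaped like the schemas above:

```python
def accumulate(deltas):
    """Fold TextDelta / ToolCallDelta dicts into (text, tool_calls)."""
    text, tool_calls = [], []
    for d in deltas:
        if d["type"] == "text":
            text.append(d["text"])
        elif d["type"] == "tool_call" and d["parse_status"] == "succeeded":
            tool_calls.append(d["tool_call"])  # the final parsed ToolCall
    return "".join(text), tool_calls

print(accumulate([{"type": "text", "text": "Hel"},
                  {"type": "text", "text": "lo"}]))  # -> ('Hello', [])
```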
"OpenAIResponseAnnotationCitation": {
"type": "object",
"properties": {
@@ -10761,6 +10328,28 @@
"title": "ScoringFnParamsType",
"description": "Types of scoring function parameter configurations."
},
+ "SystemMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "system",
+ "default": "system",
+ "description": "Must be \"system\" to identify this as a system message"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "content"
+ ],
+ "title": "SystemMessage",
+ "description": "A system message providing instructions or context to the model."
+ },
"EvaluateRowsRequest": {
"type": "object",
"properties": {
@@ -17746,6 +17335,31 @@
"title": "ModerationObjectResults",
"description": "A moderation object."
},
+ "Message": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "role",
+ "mapping": {
+ "user": "#/components/schemas/UserMessage",
+ "system": "#/components/schemas/SystemMessage",
+ "tool": "#/components/schemas/ToolResponseMessage",
+ "assistant": "#/components/schemas/CompletionMessage"
+ }
+ }
+ },
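`role` is the discriminator for the `Message` union, so clients can dispatch on it directly. A hedged rendering helper as an example:

```python
def render(message: dict) -> str:
    role = message["role"]
    if role in ("user", "system"):
        return f"{role}: {message['content']}"
    if role == "tool":
        return f"tool[{message['call_id']}]: {message['content']}"
    if role == "assistant":
        return f"assistant ({message['stop_reason']}): {message['content']}"
    raise ValueError(f"unknown role: {role!r}")

print(render({"role": "user", "content": "Hi!"}))
```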
"RunShieldRequest": {
"type": "object",
"properties": {
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index f2a618b3a..98b790a49 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -95,43 +95,6 @@ paths:
schema:
$ref: '#/components/schemas/CancelTrainingJobRequest'
required: true
- /v1/inference/chat-completion:
- post:
- responses:
- '200':
- description: >-
- If stream=False, returns a ChatCompletionResponse with the full completion.
- If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ChatCompletionResponse'
- text/event-stream:
- schema:
- $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
- '400':
- $ref: '#/components/responses/BadRequest400'
- '429':
- $ref: >-
- #/components/responses/TooManyRequests429
- '500':
- $ref: >-
- #/components/responses/InternalServerError500
- default:
- $ref: '#/components/responses/DefaultError'
- tags:
- - Inference
- summary: >-
- Generate a chat completion for the given messages using the specified model.
- description: >-
- Generate a chat completion for the given messages using the specified model.
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ChatCompletionRequest'
- required: true
/v1/agents:
get:
responses:
@@ -4397,801 +4360,6 @@ components:
required:
- job_uuid
title: CancelTrainingJobRequest
- CompletionMessage:
- type: object
- properties:
- role:
- type: string
- const: assistant
- default: assistant
- description: >-
- Must be "assistant" to identify this as the model's response
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: The content of the model's response
- stop_reason:
- type: string
- enum:
- - end_of_turn
- - end_of_message
- - out_of_tokens
- description: >-
- Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
- The model finished generating the entire response. - `StopReason.end_of_message`:
- The model finished generating but generated a partial response -- usually,
- a tool call. The user may call the tool and continue the conversation
- with the tool's response. - `StopReason.out_of_tokens`: The model ran
- out of token budget.
- tool_calls:
- type: array
- items:
- $ref: '#/components/schemas/ToolCall'
- description: >-
- List of tool calls. Each tool call is a ToolCall object.
- additionalProperties: false
- required:
- - role
- - content
- - stop_reason
- title: CompletionMessage
- description: >-
- A message containing the model's (assistant) response in a chat conversation.
- GrammarResponseFormat:
- type: object
- properties:
- type:
- type: string
- enum:
- - json_schema
- - grammar
- description: >-
- Must be "grammar" to identify this format type
- const: grammar
- default: grammar
- bnf:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: >-
- The BNF grammar specification the response should conform to
- additionalProperties: false
- required:
- - type
- - bnf
- title: GrammarResponseFormat
- description: >-
- Configuration for grammar-guided response generation.
- GreedySamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: greedy
- default: greedy
- description: >-
- Must be "greedy" to identify this sampling strategy
- additionalProperties: false
- required:
- - type
- title: GreedySamplingStrategy
- description: >-
- Greedy sampling strategy that selects the highest probability token at each
- step.
- ImageContentItem:
- type: object
- properties:
- type:
- type: string
- const: image
- default: image
- description: >-
- Discriminator type of the content item. Always "image"
- image:
- type: object
- properties:
- url:
- $ref: '#/components/schemas/URL'
- description: >-
- A URL of the image or data URL in the format of data:image/{type};base64,{data}.
- Note that URL could have length limits.
- data:
- type: string
- contentEncoding: base64
- description: base64 encoded image data as string
- additionalProperties: false
- description: >-
- Image as a base64 encoded string or an URL
- additionalProperties: false
- required:
- - type
- - image
- title: ImageContentItem
- description: A image content item
- InterleavedContent:
- oneOf:
- - type: string
- - $ref: '#/components/schemas/InterleavedContentItem'
- - type: array
- items:
- $ref: '#/components/schemas/InterleavedContentItem'
- InterleavedContentItem:
- oneOf:
- - $ref: '#/components/schemas/ImageContentItem'
- - $ref: '#/components/schemas/TextContentItem'
- discriminator:
- propertyName: type
- mapping:
- image: '#/components/schemas/ImageContentItem'
- text: '#/components/schemas/TextContentItem'
- JsonSchemaResponseFormat:
- type: object
- properties:
- type:
- type: string
- enum:
- - json_schema
- - grammar
- description: >-
- Must be "json_schema" to identify this format type
- const: json_schema
- default: json_schema
- json_schema:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- description: >-
- The JSON schema the response should conform to. In a Python SDK, this
- is often a `pydantic` model.
- additionalProperties: false
- required:
- - type
- - json_schema
- title: JsonSchemaResponseFormat
- description: >-
- Configuration for JSON schema-guided response generation.
- Message:
- oneOf:
- - $ref: '#/components/schemas/UserMessage'
- - $ref: '#/components/schemas/SystemMessage'
- - $ref: '#/components/schemas/ToolResponseMessage'
- - $ref: '#/components/schemas/CompletionMessage'
- discriminator:
- propertyName: role
- mapping:
- user: '#/components/schemas/UserMessage'
- system: '#/components/schemas/SystemMessage'
- tool: '#/components/schemas/ToolResponseMessage'
- assistant: '#/components/schemas/CompletionMessage'
- ResponseFormat:
- oneOf:
- - $ref: '#/components/schemas/JsonSchemaResponseFormat'
- - $ref: '#/components/schemas/GrammarResponseFormat'
- discriminator:
- propertyName: type
- mapping:
- json_schema: '#/components/schemas/JsonSchemaResponseFormat'
- grammar: '#/components/schemas/GrammarResponseFormat'
- SamplingParams:
- type: object
- properties:
- strategy:
- oneOf:
- - $ref: '#/components/schemas/GreedySamplingStrategy'
- - $ref: '#/components/schemas/TopPSamplingStrategy'
- - $ref: '#/components/schemas/TopKSamplingStrategy'
- discriminator:
- propertyName: type
- mapping:
- greedy: '#/components/schemas/GreedySamplingStrategy'
- top_p: '#/components/schemas/TopPSamplingStrategy'
- top_k: '#/components/schemas/TopKSamplingStrategy'
- description: The sampling strategy.
- max_tokens:
- type: integer
- default: 0
- description: >-
- The maximum number of tokens that can be generated in the completion.
- The token count of your prompt plus max_tokens cannot exceed the model's
- context length.
- repetition_penalty:
- type: number
- default: 1.0
- description: >-
- Number between -2.0 and 2.0. Positive values penalize new tokens based
- on whether they appear in the text so far, increasing the model's likelihood
- to talk about new topics.
- stop:
- type: array
- items:
- type: string
- description: >-
- Up to 4 sequences where the API will stop generating further tokens. The
- returned text will not contain the stop sequence.
- additionalProperties: false
- required:
- - strategy
- title: SamplingParams
- description: Sampling parameters.
- SystemMessage:
- type: object
- properties:
- role:
- type: string
- const: system
- default: system
- description: >-
- Must be "system" to identify this as a system message
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: >-
- The content of the "system prompt". If multiple system messages are provided,
- they are concatenated. The underlying Llama Stack code may also add other
- system messages (for example, for formatting tool definitions).
- additionalProperties: false
- required:
- - role
- - content
- title: SystemMessage
- description: >-
- A system message providing instructions or context to the model.
- TextContentItem:
- type: object
- properties:
- type:
- type: string
- const: text
- default: text
- description: >-
- Discriminator type of the content item. Always "text"
- text:
- type: string
- description: Text content
- additionalProperties: false
- required:
- - type
- - text
- title: TextContentItem
- description: A text content item
- ToolCall:
- type: object
- properties:
- call_id:
- type: string
- tool_name:
- oneOf:
- - type: string
- enum:
- - brave_search
- - wolfram_alpha
- - photogen
- - code_interpreter
- title: BuiltinTool
- - type: string
- arguments:
- oneOf:
- - type: string
- - type: object
- additionalProperties:
- oneOf:
- - type: string
- - type: integer
- - type: number
- - type: boolean
- - type: 'null'
- - type: array
- items:
- oneOf:
- - type: string
- - type: integer
- - type: number
- - type: boolean
- - type: 'null'
- - type: object
- additionalProperties:
- oneOf:
- - type: string
- - type: integer
- - type: number
- - type: boolean
- - type: 'null'
- arguments_json:
- type: string
- additionalProperties: false
- required:
- - call_id
- - tool_name
- - arguments
- title: ToolCall
- ToolConfig:
- type: object
- properties:
- tool_choice:
- oneOf:
- - type: string
- enum:
- - auto
- - required
- - none
- title: ToolChoice
- description: >-
- Whether tool use is required or automatic. This is a hint to the model
- which may not be followed. It depends on the Instruction Following
- capabilities of the model.
- - type: string
- default: auto
- description: >-
- (Optional) Whether tool use is automatic, required, or none. Can also
- specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
- tool_prompt_format:
- type: string
- enum:
- - json
- - function_tag
- - python_list
- description: >-
- (Optional) Instructs the model how to format tool calls. By default, Llama
- Stack will attempt to use a format that is best adapted to the model.
- - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
- syntax -- a list of function calls.
- system_message_behavior:
- type: string
- enum:
- - append
- - replace
- description: >-
- (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
- Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
- Replaces the default system prompt with the provided system message. The
- system message can include the string '{{function_definitions}}' to indicate
- where the function definitions should be inserted.
- default: append
- additionalProperties: false
- title: ToolConfig
- description: Configuration for tool use.
- ToolDefinition:
- type: object
- properties:
- tool_name:
- oneOf:
- - type: string
- enum:
- - brave_search
- - wolfram_alpha
- - photogen
- - code_interpreter
- title: BuiltinTool
- - type: string
- description:
- type: string
- parameters:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ToolParamDefinition'
- additionalProperties: false
- required:
- - tool_name
- title: ToolDefinition
- ToolParamDefinition:
- type: object
- properties:
- param_type:
- type: string
- description:
- type: string
- required:
- type: boolean
- default: true
- items:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- title:
- type: string
- default:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - param_type
- title: ToolParamDefinition
- ToolResponseMessage:
- type: object
- properties:
- role:
- type: string
- const: tool
- default: tool
- description: >-
- Must be "tool" to identify this as a tool response
- call_id:
- type: string
- description: >-
- Unique identifier for the tool call this response is for
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: The response content from the tool
- additionalProperties: false
- required:
- - role
- - call_id
- - content
- title: ToolResponseMessage
- description: >-
- A message representing the result of a tool invocation.
- TopKSamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: top_k
- default: top_k
- description: >-
- Must be "top_k" to identify this sampling strategy
- top_k:
- type: integer
- description: >-
- Number of top tokens to consider for sampling. Must be at least 1
- additionalProperties: false
- required:
- - type
- - top_k
- title: TopKSamplingStrategy
- description: >-
- Top-k sampling strategy that restricts sampling to the k most likely tokens.
- TopPSamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: top_p
- default: top_p
- description: >-
- Must be "top_p" to identify this sampling strategy
- temperature:
- type: number
- description: >-
- Controls randomness in sampling. Higher values increase randomness
- top_p:
- type: number
- default: 0.95
- description: >-
- Cumulative probability threshold for nucleus sampling. Defaults to 0.95
- additionalProperties: false
- required:
- - type
- title: TopPSamplingStrategy
- description: >-
- Top-p (nucleus) sampling strategy that samples from the smallest set of tokens
- with cumulative probability >= p.
- URL:
- type: object
- properties:
- uri:
- type: string
- description: The URL string pointing to the resource
- additionalProperties: false
- required:
- - uri
- title: URL
- description: A URL reference to external content.
- UserMessage:
- type: object
- properties:
- role:
- type: string
- const: user
- default: user
- description: >-
- Must be "user" to identify this as a user message
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: >-
- The content of the message, which can include text and other media
- context:
- $ref: '#/components/schemas/InterleavedContent'
- description: >-
- (Optional) This field is used internally by Llama Stack to pass RAG context.
- This field may be removed in the API in the future.
- additionalProperties: false
- required:
- - role
- - content
- title: UserMessage
- description: >-
- A message from the user in a chat conversation.
- ChatCompletionRequest:
- type: object
- properties:
- model_id:
- type: string
- description: >-
- The identifier of the model to use. The model must be registered with
- Llama Stack and available via the /models endpoint.
- messages:
- type: array
- items:
- $ref: '#/components/schemas/Message'
- description: List of messages in the conversation.
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- description: >-
- Parameters to control the sampling strategy.
- tools:
- type: array
- items:
- $ref: '#/components/schemas/ToolDefinition'
- description: >-
- (Optional) List of tool definitions available to the model.
- tool_choice:
- type: string
- enum:
- - auto
- - required
- - none
- description: >-
- (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
- .. deprecated:: Use tool_config instead.
- tool_prompt_format:
- type: string
- enum:
- - json
- - function_tag
- - python_list
- description: >-
- (Optional) Instructs the model how to format tool calls. By default, Llama
- Stack will attempt to use a format that is best adapted to the model.
- - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
- syntax -- a list of function calls. .. deprecated:: Use tool_config instead.
- response_format:
- $ref: '#/components/schemas/ResponseFormat'
- description: >-
- (Optional) Grammar specification for guided (structured) decoding. There
- are two options: - `ResponseFormat.json_schema`: The grammar is a JSON
- schema. Most providers support this format. - `ResponseFormat.grammar`:
- The grammar is a BNF grammar. This format is more flexible, but not all
- providers support it.
- stream:
- type: boolean
- description: >-
- (Optional) If True, generate an SSE event stream of the response. Defaults
- to False.
- logprobs:
- type: object
- properties:
- top_k:
- type: integer
- default: 0
- description: >-
- How many tokens (for each position) to return log probabilities for.
- additionalProperties: false
- description: >-
- (Optional) If specified, log probabilities for each token position will
- be returned.
- tool_config:
- $ref: '#/components/schemas/ToolConfig'
- description: (Optional) Configuration for tool use.
- additionalProperties: false
- required:
- - model_id
- - messages
- title: ChatCompletionRequest
- ChatCompletionResponse:
- type: object
- properties:
- metrics:
- type: array
- items:
- $ref: '#/components/schemas/MetricInResponse'
- description: >-
- (Optional) List of metrics associated with the API response
- completion_message:
- $ref: '#/components/schemas/CompletionMessage'
- description: The complete response message
- logprobs:
- type: array
- items:
- $ref: '#/components/schemas/TokenLogProbs'
- description: >-
- Optional log probabilities for generated tokens
- additionalProperties: false
- required:
- - completion_message
- title: ChatCompletionResponse
- description: Response from a chat completion request.
- MetricInResponse:
- type: object
- properties:
- metric:
- type: string
- description: The name of the metric
- value:
- oneOf:
- - type: integer
- - type: number
- description: The numeric value of the metric
- unit:
- type: string
- description: >-
- (Optional) The unit of measurement for the metric value
- additionalProperties: false
- required:
- - metric
- - value
- title: MetricInResponse
- description: >-
- A metric value included in API responses.
- TokenLogProbs:
- type: object
- properties:
- logprobs_by_token:
- type: object
- additionalProperties:
- type: number
- description: >-
- Dictionary mapping tokens to their log probabilities
- additionalProperties: false
- required:
- - logprobs_by_token
- title: TokenLogProbs
- description: Log probabilities for generated tokens.
- ChatCompletionResponseEvent:
- type: object
- properties:
- event_type:
- type: string
- enum:
- - start
- - complete
- - progress
- description: Type of the event
- delta:
- oneOf:
- - $ref: '#/components/schemas/TextDelta'
- - $ref: '#/components/schemas/ImageDelta'
- - $ref: '#/components/schemas/ToolCallDelta'
- discriminator:
- propertyName: type
- mapping:
- text: '#/components/schemas/TextDelta'
- image: '#/components/schemas/ImageDelta'
- tool_call: '#/components/schemas/ToolCallDelta'
- description: >-
- Content generated since last event. This can be one or more tokens, or
- a tool call.
- logprobs:
- type: array
- items:
- $ref: '#/components/schemas/TokenLogProbs'
- description: >-
- Optional log probabilities for generated tokens
- stop_reason:
- type: string
- enum:
- - end_of_turn
- - end_of_message
- - out_of_tokens
- description: >-
- Optional reason why generation stopped, if complete
- additionalProperties: false
- required:
- - event_type
- - delta
- title: ChatCompletionResponseEvent
- description: >-
- An event during chat completion generation.
- ChatCompletionResponseStreamChunk:
- type: object
- properties:
- metrics:
- type: array
- items:
- $ref: '#/components/schemas/MetricInResponse'
- description: >-
- (Optional) List of metrics associated with the API response
- event:
- $ref: '#/components/schemas/ChatCompletionResponseEvent'
- description: The event containing the new content
- additionalProperties: false
- required:
- - event
- title: ChatCompletionResponseStreamChunk
- description: >-
- A chunk of a streamed chat completion response.
- ImageDelta:
- type: object
- properties:
- type:
- type: string
- const: image
- default: image
- description: >-
- Discriminator type of the delta. Always "image"
- image:
- type: string
- contentEncoding: base64
- description: The incremental image data as bytes
- additionalProperties: false
- required:
- - type
- - image
- title: ImageDelta
- description: >-
- An image content delta for streaming responses.
- TextDelta:
- type: object
- properties:
- type:
- type: string
- const: text
- default: text
- description: >-
- Discriminator type of the delta. Always "text"
- text:
- type: string
- description: The incremental text content
- additionalProperties: false
- required:
- - type
- - text
- title: TextDelta
- description: >-
- A text content delta for streaming responses.
- ToolCallDelta:
- type: object
- properties:
- type:
- type: string
- const: tool_call
- default: tool_call
- description: >-
- Discriminator type of the delta. Always "tool_call"
- tool_call:
- oneOf:
- - type: string
- - $ref: '#/components/schemas/ToolCall'
- description: >-
- Either an in-progress tool call string or the final parsed tool call
- parse_status:
- type: string
- enum:
- - started
- - in_progress
- - failed
- - succeeded
- description: Current parsing status of the tool call
- additionalProperties: false
- required:
- - type
- - tool_call
- - parse_status
- title: ToolCallDelta
- description: >-
- A tool call content delta for streaming responses.
AgentConfig:
type: object
properties:
@@ -5287,6 +4455,183 @@ components:
- name
- args
title: AgentToolGroupWithArgs
+ GrammarResponseFormat:
+ type: object
+ properties:
+ type:
+ type: string
+ enum:
+ - json_schema
+ - grammar
+ description: >-
+ Must be "grammar" to identify this format type
+ const: grammar
+ default: grammar
+ bnf:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The BNF grammar specification the response should conform to
+ additionalProperties: false
+ required:
+ - type
+ - bnf
+ title: GrammarResponseFormat
+ description: >-
+ Configuration for grammar-guided response generation.
+ GreedySamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: greedy
+ default: greedy
+ description: >-
+ Must be "greedy" to identify this sampling strategy
+ additionalProperties: false
+ required:
+ - type
+ title: GreedySamplingStrategy
+ description: >-
+ Greedy sampling strategy that selects the highest probability token at each
+ step.
+ JsonSchemaResponseFormat:
+ type: object
+ properties:
+ type:
+ type: string
+ enum:
+ - json_schema
+ - grammar
+ description: >-
+ Must be "json_schema" to identify this format type
+ const: json_schema
+ default: json_schema
+ json_schema:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The JSON schema the response should conform to. In a Python SDK, this
+ is often a `pydantic` model.
+ additionalProperties: false
+ required:
+ - type
+ - json_schema
+ title: JsonSchemaResponseFormat
+ description: >-
+ Configuration for JSON schema-guided response generation.
+ ResponseFormat:
+ oneOf:
+ - $ref: '#/components/schemas/JsonSchemaResponseFormat'
+ - $ref: '#/components/schemas/GrammarResponseFormat'
+ discriminator:
+ propertyName: type
+ mapping:
+ json_schema: '#/components/schemas/JsonSchemaResponseFormat'
+ grammar: '#/components/schemas/GrammarResponseFormat'
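For guided decoding, the `ResponseFormat` union is discriminated on `type`. A minimal sketch of the `json_schema` variant, with an assumed schema:

```python
response_format = {
    "type": "json_schema",
    "json_schema": {  # any JSON schema; this one is illustrative
        "type": "object",
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    },
}
```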
+ SamplingParams:
+ type: object
+ properties:
+ strategy:
+ oneOf:
+ - $ref: '#/components/schemas/GreedySamplingStrategy'
+ - $ref: '#/components/schemas/TopPSamplingStrategy'
+ - $ref: '#/components/schemas/TopKSamplingStrategy'
+ discriminator:
+ propertyName: type
+ mapping:
+ greedy: '#/components/schemas/GreedySamplingStrategy'
+ top_p: '#/components/schemas/TopPSamplingStrategy'
+ top_k: '#/components/schemas/TopKSamplingStrategy'
+ description: The sampling strategy.
+ max_tokens:
+ type: integer
+ default: 0
+ description: >-
+ The maximum number of tokens that can be generated in the completion.
+ The token count of your prompt plus max_tokens cannot exceed the model's
+ context length.
+ repetition_penalty:
+ type: number
+ default: 1.0
+ description: >-
+ Number between -2.0 and 2.0. Positive values penalize new tokens based
+ on whether they appear in the text so far, increasing the model's likelihood
+ to talk about new topics.
+ stop:
+ type: array
+ items:
+ type: string
+ description: >-
+ Up to 4 sequences where the API will stop generating further tokens. The
+ returned text will not contain the stop sequence.
+ additionalProperties: false
+ required:
+ - strategy
+ title: SamplingParams
+ description: Sampling parameters.
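A `SamplingParams` payload combining the pieces above might look like the following sketch (the numbers are illustrative assumptions, not recommendations):

```python
sampling_params = {
    "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.9},
    "max_tokens": 256,             # the schema defaults this to 0
    "repetition_penalty": 1.1,
    "stop": ["</answer>"],         # up to 4 stop sequences
}
```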
+ ToolConfig:
+ type: object
+ properties:
+ tool_choice:
+ oneOf:
+ - type: string
+ enum:
+ - auto
+ - required
+ - none
+ title: ToolChoice
+ description: >-
+ Whether tool use is required or automatic. This is a hint to the model
+ which may not be followed. It depends on the Instruction Following
+ capabilities of the model.
+ - type: string
+ default: auto
+ description: >-
+ (Optional) Whether tool use is automatic, required, or none. Can also
+ specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
+ tool_prompt_format:
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
+ description: >-
+ (Optional) Instructs the model how to format tool calls. By default, Llama
+ Stack will attempt to use a format that is best adapted to the model.
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+ - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
+ tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
+ syntax -- a list of function calls.
+ system_message_behavior:
+ type: string
+ enum:
+ - append
+ - replace
+ description: >-
+ (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
+ Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
+ Replaces the default system prompt with the provided system message. The
+ system message can include the string '{{function_definitions}}' to indicate
+ where the function definitions should be inserted.
+ default: append
+ additionalProperties: false
+ title: ToolConfig
+ description: Configuration for tool use.
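And a `ToolConfig` that spells out the schema's defaults explicitly, again as a sketch:

```python
tool_config = {
    "tool_choice": "auto",                # or "required", "none", or a specific tool name
    "tool_prompt_format": "json",         # optional; the stack picks a model-appropriate default
    "system_message_behavior": "append",  # or "replace" with '{{function_definitions}}'
}
```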
ToolDef:
type: object
properties:
@@ -5365,6 +4710,51 @@ components:
- required
title: ToolParameter
description: Parameter definition for a tool.
+ TopKSamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: top_k
+ default: top_k
+ description: >-
+ Must be "top_k" to identify this sampling strategy
+ top_k:
+ type: integer
+ description: >-
+ Number of top tokens to consider for sampling. Must be at least 1
+ additionalProperties: false
+ required:
+ - type
+ - top_k
+ title: TopKSamplingStrategy
+ description: >-
+ Top-k sampling strategy that restricts sampling to the k most likely tokens.
+ TopPSamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: top_p
+ default: top_p
+ description: >-
+ Must be "top_p" to identify this sampling strategy
+ temperature:
+ type: number
+ description: >-
+ Controls randomness in sampling. Higher values increase randomness
+ top_p:
+ type: number
+ default: 0.95
+ description: >-
+ Cumulative probability threshold for nucleus sampling. Defaults to 0.95
+ additionalProperties: false
+ required:
+ - type
+ title: TopPSamplingStrategy
+ description: >-
+ Top-p (nucleus) sampling strategy that samples from the smallest set of tokens
+ with cumulative probability >= p.
CreateAgentRequest:
type: object
properties:
@@ -5410,6 +4800,130 @@ components:
title: AgentSessionCreateResponse
description: >-
Response returned when creating a new agent session.
+ ImageContentItem:
+ type: object
+ properties:
+ type:
+ type: string
+ const: image
+ default: image
+ description: >-
+ Discriminator type of the content item. Always "image"
+ image:
+ type: object
+ properties:
+ url:
+ $ref: '#/components/schemas/URL'
+ description: >-
+ A URL of the image or data URL in the format of data:image/{type};base64,{data}.
+ Note that URLs may have length limits.
+ data:
+ type: string
+ contentEncoding: base64
+ description: Base64-encoded image data as a string
+ additionalProperties: false
+ description: >-
+ Image as a base64-encoded string or a URL
+ additionalProperties: false
+ required:
+ - type
+ - image
+ title: ImageContentItem
+ description: An image content item
+ InterleavedContent:
+ oneOf:
+ - type: string
+ - $ref: '#/components/schemas/InterleavedContentItem'
+ - type: array
+ items:
+ $ref: '#/components/schemas/InterleavedContentItem'
+ InterleavedContentItem:
+ oneOf:
+ - $ref: '#/components/schemas/ImageContentItem'
+ - $ref: '#/components/schemas/TextContentItem'
+ discriminator:
+ propertyName: type
+ mapping:
+ image: '#/components/schemas/ImageContentItem'
+ text: '#/components/schemas/TextContentItem'
+ TextContentItem:
+ type: object
+ properties:
+ type:
+ type: string
+ const: text
+ default: text
+ description: >-
+ Discriminator type of the content item. Always "text"
+ text:
+ type: string
+ description: Text content
+ additionalProperties: false
+ required:
+ - type
+ - text
+ title: TextContentItem
+ description: A text content item
+ ToolResponseMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: tool
+ default: tool
+ description: >-
+ Must be "tool" to identify this as a tool response
+ call_id:
+ type: string
+ description: >-
+ Unique identifier for the tool call this response is for
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: The response content from the tool
+ additionalProperties: false
+ required:
+ - role
+ - call_id
+ - content
+ title: ToolResponseMessage
+ description: >-
+ A message representing the result of a tool invocation.
+ URL:
+ type: object
+ properties:
+ uri:
+ type: string
+ description: The URL string pointing to the resource
+ additionalProperties: false
+ required:
+ - uri
+ title: URL
+ description: A URL reference to external content.
+ UserMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: user
+ default: user
+ description: >-
+ Must be "user" to identify this as a user message
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ The content of the message, which can include text and other media
+ context:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ (Optional) This field is used internally by Llama Stack to pass RAG context.
+ This field may be removed in the API in the future.
+ additionalProperties: false
+ required:
+ - role
+ - content
+ title: UserMessage
+ description: >-
+ A message from the user in a chat conversation.
CreateAgentTurnRequest:
type: object
properties:
@@ -5466,6 +4980,45 @@ components:
required:
- messages
title: CreateAgentTurnRequest
+ CompletionMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: assistant
+ default: assistant
+ description: >-
+ Must be "assistant" to identify this as the model's response
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: The content of the model's response
+ stop_reason:
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: >-
+ Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
+ The model finished generating the entire response. - `StopReason.end_of_message`:
+ The model finished generating but generated a partial response -- usually,
+ a tool call. The user may call the tool and continue the conversation
+ with the tool's response. - `StopReason.out_of_tokens`: The model ran
+ out of token budget.
+ tool_calls:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolCall'
+ description: >-
+ List of tool calls. Each tool call is a ToolCall object.
+ additionalProperties: false
+ required:
+ - role
+ - content
+ - stop_reason
+ title: CompletionMessage
+ description: >-
+ A message containing the model's (assistant) response in a chat conversation.
InferenceStep:
type: object
properties:
@@ -5619,6 +5172,56 @@ components:
- step_type
title: ShieldCallStep
description: A shield call step in an agent turn.
+ ToolCall:
+ type: object
+ properties:
+ call_id:
+ type: string
+ tool_name:
+ oneOf:
+ - type: string
+ enum:
+ - brave_search
+ - wolfram_alpha
+ - photogen
+ - code_interpreter
+ title: BuiltinTool
+ - type: string
+ arguments:
+ oneOf:
+ - type: string
+ - type: object
+ additionalProperties:
+ oneOf:
+ - type: string
+ - type: integer
+ - type: number
+ - type: boolean
+ - type: 'null'
+ - type: array
+ items:
+ oneOf:
+ - type: string
+ - type: integer
+ - type: number
+ - type: boolean
+ - type: 'null'
+ - type: object
+ additionalProperties:
+ oneOf:
+ - type: string
+ - type: integer
+ - type: number
+ - type: boolean
+ - type: 'null'
+ arguments_json:
+ type: string
+ additionalProperties: false
+ required:
+ - call_id
+ - tool_name
+ - arguments
+ title: ToolCall
ToolExecutionStep:
type: object
properties:
@@ -6064,6 +5667,76 @@ components:
title: AgentTurnResponseTurnStartPayload
description: >-
Payload for turn start events in agent turn responses.
+ ImageDelta:
+ type: object
+ properties:
+ type:
+ type: string
+ const: image
+ default: image
+ description: >-
+ Discriminator type of the delta. Always "image"
+ image:
+ type: string
+ contentEncoding: base64
+ description: The incremental image data as bytes
+ additionalProperties: false
+ required:
+ - type
+ - image
+ title: ImageDelta
+ description: >-
+ An image content delta for streaming responses.
+ TextDelta:
+ type: object
+ properties:
+ type:
+ type: string
+ const: text
+ default: text
+ description: >-
+ Discriminator type of the delta. Always "text"
+ text:
+ type: string
+ description: The incremental text content
+ additionalProperties: false
+ required:
+ - type
+ - text
+ title: TextDelta
+ description: >-
+ A text content delta for streaming responses.
+ ToolCallDelta:
+ type: object
+ properties:
+ type:
+ type: string
+ const: tool_call
+ default: tool_call
+ description: >-
+ Discriminator type of the delta. Always "tool_call"
+ tool_call:
+ oneOf:
+ - type: string
+ - $ref: '#/components/schemas/ToolCall'
+ description: >-
+ Either an in-progress tool call string or the final parsed tool call
+ parse_status:
+ type: string
+ enum:
+ - started
+ - in_progress
+ - failed
+ - succeeded
+ description: Current parsing status of the tool call
+ additionalProperties: false
+ required:
+ - type
+ - tool_call
+ - parse_status
+ title: ToolCallDelta
+ description: >-
+ A tool call content delta for streaming responses.
OpenAIResponseAnnotationCitation:
type: object
properties:
@@ -7954,6 +7627,28 @@ components:
title: ScoringFnParamsType
description: >-
Types of scoring function parameter configurations.
+ SystemMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: system
+ default: system
+ description: >-
+ Must be "system" to identify this as a system message
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: >-
+ The content of the "system prompt". If multiple system messages are provided,
+ they are concatenated. The underlying Llama Stack code may also add other
+ system messages (for example, for formatting tool definitions).
+ additionalProperties: false
+ required:
+ - role
+ - content
+ title: SystemMessage
+ description: >-
+ A system message providing instructions or context to the model.
EvaluateRowsRequest:
type: object
properties:
@@ -13139,6 +12834,19 @@ components:
- metadata
title: ModerationObjectResults
description: A moderation object.
+ Message:
+ oneOf:
+ - $ref: '#/components/schemas/UserMessage'
+ - $ref: '#/components/schemas/SystemMessage'
+ - $ref: '#/components/schemas/ToolResponseMessage'
+ - $ref: '#/components/schemas/CompletionMessage'
+ discriminator:
+ propertyName: role
+ mapping:
+ user: '#/components/schemas/UserMessage'
+ system: '#/components/schemas/SystemMessage'
+ tool: '#/components/schemas/ToolResponseMessage'
+ assistant: '#/components/schemas/CompletionMessage'
RunShieldRequest:
type: object
properties:
diff --git a/docs/zero_to_hero_guide/00_Inference101.ipynb b/docs/zero_to_hero_guide/00_Inference101.ipynb
index 0da3b702c..6cc714c9e 100644
--- a/docs/zero_to_hero_guide/00_Inference101.ipynb
+++ b/docs/zero_to_hero_guide/00_Inference101.ipynb
@@ -102,15 +102,15 @@
}
],
"source": [
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
" ],\n",
- " model_id=MODEL_NAME,\n",
+ " model=MODEL_NAME,\n",
")\n",
"\n",
- "print(response.completion_message.content)"
+ "print(response.choices[0].message.content)"
]
},
{
@@ -141,14 +141,14 @@
}
],
"source": [
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are shakespeare.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
" ],\n",
- " model_id=MODEL_NAME, # Changed from model to model_id\n",
+ " model=MODEL_NAME,\n",
")\n",
- "print(response.completion_message.content)"
+ "print(response.choices[0].message.content)"
]
},
{
@@ -218,11 +218,11 @@
" break\n",
"\n",
" message = {\"role\": \"user\", \"content\": user_input}\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=[message],\n",
- " model_id=MODEL_NAME\n",
+ " model=MODEL_NAME\n",
" )\n",
- " cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+ " cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
"\n",
"# Run the chat loop in a Jupyter Notebook cell using await\n",
"await chat_loop()\n",
@@ -288,16 +288,16 @@
" user_message = {\"role\": \"user\", \"content\": user_input}\n",
" conversation_history.append(user_message)\n",
"\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=conversation_history,\n",
- " model_id=MODEL_NAME,\n",
+ " model=MODEL_NAME,\n",
" )\n",
- " cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+ " cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
"\n",
" # Append the assistant message with all required fields\n",
" assistant_message = {\n",
" \"role\": \"user\",\n",
- " \"content\": response.completion_message.content,\n",
+ " \"content\": response.choices[0].message.content,\n",
" # Add any additional required fields here if necessary\n",
" }\n",
" conversation_history.append(assistant_message)\n",
@@ -349,14 +349,14 @@
" }\n",
" cprint(f'User> {message[\"content\"]}', 'green')\n",
"\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=[message],\n",
- " model_id=MODEL_NAME,\n",
+ " model=MODEL_NAME,\n",
" stream=stream,\n",
" )\n",
"\n",
" if not stream:\n",
- " cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+ " cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
" else:\n",
" for log in EventLogger().log(response):\n",
" log.print()\n",
diff --git a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
index dc56eee69..24a06bf81 100644
--- a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
+++ b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
@@ -134,15 +134,15 @@
" }\n",
" cprint(f'User> {message[\"content\"]}', 'green')\n",
"\n",
- " response = await client.inference.chat_completion(\n",
+ " response = await client.chat.completions.create(\n",
" messages=[message],\n",
- " model_id='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
+ " model='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
" stream=stream,\n",
" )\n",
"\n",
" cprint(f'Assistant> ', color='cyan', end='')\n",
" if not stream:\n",
- " cprint(response.completion_message.content, color='yellow')\n",
+ " cprint(response.choices[0].message.content, color='yellow')\n",
" else:\n",
" async for chunk in response:\n",
" cprint(chunk.event.delta.text, color='yellow', end='')\n",
diff --git a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb b/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb
index bfc1d8067..80d07447d 100644
--- a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb
+++ b/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb
@@ -152,8 +152,8 @@
"metadata": {},
"outputs": [],
"source": [
- "response = client.inference.chat_completion(\n",
- " messages=few_shot_examples, model_id=MODEL_NAME\n",
+ "response = client.chat.completions.create(\n",
+ " messages=few_shot_examples, model=MODEL_NAME\n",
")"
]
},
@@ -164,7 +164,7 @@
"source": [
"#### 4. Display the Model’s Response\n",
"\n",
- "The `completion_message` contains the assistant’s generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
+ "The `choices[0].message.content` contains the assistant’s generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
]
},
{
@@ -184,7 +184,7 @@
"source": [
"from termcolor import cprint\n",
"\n",
- "cprint(f'> Response: {response.completion_message.content}', 'cyan')"
+ "cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
]
},
{
@@ -219,7 +219,7 @@
"\n",
"client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')\n",
"\n",
- "response = client.inference.chat_completion(\n",
+ "response = client.chat.completions.create(\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": 'Have shorter, spear-shaped ears.'},\n",
" {\n",
@@ -253,10 +253,10 @@
" \"content\": 'Generally taller and more robust, commonly seen as guard animals.'\n",
" }\n",
"],\n",
- " model_id=MODEL_NAME,\n",
+ " model=MODEL_NAME,\n",
")\n",
"\n",
- "cprint(f'> Response: {response.completion_message.content}', 'cyan')"
+ "cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
]
},
{
diff --git a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb b/docs/zero_to_hero_guide/03_Image_Chat101.ipynb
index dd866061f..be29800e6 100644
--- a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb
+++ b/docs/zero_to_hero_guide/03_Image_Chat101.ipynb
@@ -102,15 +102,15 @@
" }\n",
"\n",
" cprint(\"User> Sending image for analysis...\", \"green\")\n",
- " response = client.inference.chat_completion(\n",
+ " response = client.chat.completions.create(\n",
" messages=[message],\n",
- " model_id=MODEL_NAME,\n",
+ " model=MODEL_NAME,\n",
" stream=stream,\n",
" )\n",
"\n",
" cprint(f'Assistant> ', color='cyan', end='')\n",
" if not stream:\n",
- " cprint(response.completion_message.content, color='yellow')\n",
+ " cprint(response.choices[0].message.content, color='yellow')\n",
" else:\n",
" for chunk in response:\n",
" cprint(chunk.event.delta.text, color='yellow', end='')\n",
diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md
index 4ca9dec72..183038a88 100644
--- a/docs/zero_to_hero_guide/README.md
+++ b/docs/zero_to_hero_guide/README.md
@@ -131,14 +131,37 @@ After setting up the server, open a new terminal window and configure the llama-
```
**Expected Output:**
```bash
- ChatCompletionResponse(
- completion_message=CompletionMessage(
- content='Here is a 2-sentence poem about the moon:\n\nSilver crescent shining bright in the night,\nA beacon of wonder, full of gentle light.',
- role='assistant',
- stop_reason='end_of_turn',
- tool_calls=[]
- ),
- logprobs=None
+ OpenAIChatCompletion(
+ id='chatcmpl-950',
+ choices=[
+ OpenAIChatCompletionChoice(
+ finish_reason='stop',
+ index=0,
+ message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+ role='assistant',
+ content='...The moon casts silver threads through the velvet night, a silent bard of shadows, ancient and bright.',
+ name=None,
+ tool_calls=None,
+ refusal=None,
+ annotations=None,
+ audio=None,
+ function_call=None
+ ),
+ logprobs=None
+ )
+ ],
+ created=1759240813,
+ model='meta-llama/Llama-3.2-3B-Instruct',
+ object='chat.completion',
+ service_tier=None,
+ system_fingerprint='fp_ollama',
+ usage={
+ 'completion_tokens': 479,
+ 'prompt_tokens': 19,
+ 'total_tokens': 498,
+ 'completion_tokens_details': None,
+ 'prompt_tokens_details': None
+ },
)
```
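For readers porting their own scripts, the fields shown in this repr map onto attribute access like so (a sketch, assuming `response` holds the object printed above):

```python
choice = response.choices[0]
print(choice.finish_reason)    # 'stop'
print(choice.message.content)  # the generated poem
print(response.usage)          # token accounting, when the provider reports it
```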
@@ -147,21 +170,16 @@ After setting up the server, open a new terminal window and configure the llama-
After setting up the server, open a new terminal window and verify it's working by sending a `POST` request using `curl`:
```bash
-curl http://localhost:$LLAMA_STACK_PORT/alpha/inference/chat-completion
+curl http://localhost:$LLAMA_STACK_PORT/v1/chat/completions
-H "Content-Type: application/json"
-d @- < 0
+ assert response.choices[0].message is not None
+ assert isinstance(response.choices[0].message.content, str)
+ assert len(response.choices[0].message.content) > 0
```
### Provider-Specific Tests