Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-03 19:57:35 +00:00)
chore: unpublish /inference/chat-completion
parent 6cce553c93
commit b0e161d3db
23 changed files with 1448 additions and 2137 deletions
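At a glance, this commit retires the legacy `client.inference.chat_completion` call (and the `/v1/inference/chat-completion` route) in favor of the OpenAI-compatible `client.chat.completions.create` call across the docs, notebooks, and examples. A minimal before/after sketch of the client-side change (the base URL and model id below are illustrative, not taken from the diff):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # illustrative server address
model_id = "meta-llama/Llama-3.2-3B-Instruct"  # illustrative model id

# Before: legacy inference API, unpublished by this commit.
# response = client.inference.chat_completion(
#     model_id=model_id,
#     messages=[{"role": "user", "content": "Hello!"}],
# )
# print(response.completion_message.content)

# After: OpenAI-compatible chat completions API.
response = client.chat.completions.create(
    model=model_id,
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```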
README.md (25 changes)
@@ -43,10 +43,21 @@ inference chat-completion \
 --model-id meta-llama/$MODEL \
 --message "write a haiku for meta's llama 4 models"

-ChatCompletionResponse(
-    completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
-    logprobs=None,
-    metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
-)
+OpenAIChatCompletion(
+    ...
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content='...**Silent minds awaken,** \n**Whispers of billions of words,** \n**Reasoning breaks the night.** \n\n— \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
+                ...
+            ),
+            ...
+        )
+    ],
+    ...
+)
 ```
 ### Python SDK
@@ -59,14 +70,14 @@ model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 prompt = "Write a haiku about coding"

 print(f"User> {prompt}")
-response = client.inference.chat_completion(
-    model_id=model_id,
+response = client.chat.completions.create(
+    model=model_id,
     messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": prompt},
     ],
 )
-print(f"Assistant> {response.completion_message.content}")
+print(f"Assistant> {response.choices[0].message.content}")
 ```

 As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
@@ -44,7 +44,7 @@ The playground provides interactive pages for users to explore Llama Stack API c

 **Simple Chat Interface**
 - Chat directly with Llama models through an intuitive interface
-- Uses the `/inference/chat-completion` streaming API under the hood
+- Uses the `/chat/completions` streaming API under the hood
 - Real-time message streaming for responsive interactions
 - Perfect for testing model capabilities and prompt engineering

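Since the playground bullet above now points at the `/chat/completions` streaming API, a minimal streaming sketch may help; it assumes a llama-stack server on localhost:8321, an illustrative model id, and the OpenAI-compatible chunk shape (`choices[0].delta.content`):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

stream = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # illustrative model id
    messages=[{"role": "user", "content": "Write a haiku about llamas."}],
    stream=True,  # stream tokens as they are generated
)
for chunk in stream:
    # Each streamed chunk carries an incremental content delta (may be None on the final chunk).
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```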
@@ -313,7 +313,7 @@ client = LlamaStackClient(
 )

 # All API calls will be automatically traced
-response = client.inference.chat_completion(
+response = client.chat.completions.create(
     model="meta-llama/Llama-3.2-3B-Instruct",
     messages=[{"role": "user", "content": "Hello!"}]
 )
@@ -327,7 +327,7 @@ with tracer.start_as_current_span("custom_operation") as span:
     span.set_attribute("user_id", "user123")
     span.set_attribute("operation_type", "chat_completion")

-    response = client.inference.chat_completion(
+    response = client.chat.completions.create(
         model="meta-llama/Llama-3.2-3B-Instruct",
         messages=[{"role": "user", "content": "Hello!"}]
     )
@@ -216,7 +216,6 @@ from llama_stack_client.types import (

 Methods:

-- <code title="post /v1/inference/chat-completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
 - <code title="post /v1/inference/embeddings">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>

 ## VectorIo
@@ -543,15 +543,15 @@
 "source": [
 "model_id = \"meta-llama/Llama-3.3-70B-Instruct\"\n",
 "\n",
-"response = client.inference.chat_completion(\n",
-" model_id=model_id,\n",
+"response = client.chat.completions.create(\n",
+" model=model_id,\n",
 " messages=[\n",
 " {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
 " {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
 " ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)\n"
+"print(response.choices[0].message.content)\n"
 ]
 },
 {
@@ -625,16 +625,16 @@
 " user_message = {\"role\": \"user\", \"content\": user_input}\n",
 " conversation_history.append(user_message)\n",
 "\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=conversation_history,\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " )\n",
-" cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+" cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 " assistant_message = {\n",
 " \"role\": \"assistant\", # was user\n",
-" \"content\": response.completion_message.content,\n",
-" \"stop_reason\": response.completion_message.stop_reason,\n",
+" \"content\": response.choices[0].message.content,\n",
+" \"stop_reason\": response.choices[0].finish_reason,\n",
 " }\n",
 " conversation_history.append(assistant_message)\n",
 "\n",
@@ -691,16 +691,16 @@
 " user_message = {\"role\": \"user\", \"content\": user_input}\n",
 " conversation_history.append(user_message)\n",
 "\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=conversation_history,\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " )\n",
-" cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+" cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 " assistant_message = {\n",
 " \"role\": \"assistant\", # was user\n",
-" \"content\": response.completion_message.content,\n",
-" \"stop_reason\": response.completion_message.stop_reason,\n",
+" \"content\": response.choices[0].message.content,\n",
+" \"stop_reason\": response.choices[0].finish_reason,\n",
 " }\n",
 " conversation_history.append(assistant_message)\n",
 "\n",
@@ -763,9 +763,9 @@
 "message = {\"role\": \"user\", \"content\": \"Write me a sonnet about llama\"}\n",
 "print(f'User> {message[\"content\"]}')\n",
 "\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[message],\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " stream=True, # <-----------\n",
 ")\n",
 "\n",
@@ -2917,7 +2917,7 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[\n",
 " {\n",
 " \"role\": \"user\",\n",
@@ -2937,11 +2937,11 @@
 " ]\n",
 " }\n",
 " ],\n",
-" model_id=vision_model_id,\n",
+" model=vision_model_id,\n",
 " stream=False,\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -577,15 +577,15 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
-" model_id=model_id,\n",
+"response = client.chat.completions.create(\n",
+" model=model_id,\n",
 " messages=[\n",
 " {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
 " {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
 " ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)\n"
+"print(response.choices[0].message.content)\n"
 ]
 },
 {
@@ -673,7 +673,7 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[\n",
 " {\n",
 " \"role\": \"user\",\n",
@@ -693,11 +693,11 @@
 " ]\n",
 " }\n",
 " ],\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " stream=False,\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -767,16 +767,16 @@
 " user_message = {\"role\": \"user\", \"content\": user_input}\n",
 " conversation_history.append(user_message)\n",
 "\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=conversation_history,\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " )\n",
-" cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+" cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 " assistant_message = {\n",
 " \"role\": \"assistant\", # was user\n",
-" \"content\": response.completion_message.content,\n",
-" \"stop_reason\": response.completion_message.stop_reason,\n",
+" \"content\": response.choices[0].message.content,\n",
+" \"stop_reason\": response.choices[0].finish_reason,\n",
 " }\n",
 " conversation_history.append(assistant_message)\n",
 "\n",
@@ -831,16 +831,16 @@
 " user_message = {\"role\": \"user\", \"content\": user_input}\n",
 " conversation_history.append(user_message)\n",
 "\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=conversation_history,\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " )\n",
-" cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+" cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 " assistant_message = {\n",
 " \"role\": \"assistant\", # was user\n",
-" \"content\": response.completion_message.content,\n",
-" \"stop_reason\": response.completion_message.stop_reason,\n",
+" \"content\": response.choices[0].message.content,\n",
+" \"stop_reason\": response.choices[0].finish_reason,\n",
 " }\n",
 " conversation_history.append(assistant_message)\n",
 "\n",
@@ -608,15 +608,15 @@
 "# TODO: update this with a vision model\n",
 "model_id = \"meta-llama/Llama-4-Maverick-17B-128E-Instruct\"\n",
 "\n",
-"response = client.inference.chat_completion(\n",
-" model_id=model_id,\n",
+"response = client.chat.completions.create(\n",
+" model=model_id,\n",
 " messages=[\n",
 " {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
 " {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
 " ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)\n"
+"print(response.choices[0].message.content)\n"
 ]
 },
 {
@@ -704,7 +704,7 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[\n",
 " {\n",
 " \"role\": \"user\",\n",
@@ -724,11 +724,11 @@
 " ]\n",
 " }\n",
 " ],\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " stream=False,\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -798,16 +798,16 @@
 " user_message = {\"role\": \"user\", \"content\": user_input}\n",
 " conversation_history.append(user_message)\n",
 "\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=conversation_history,\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " )\n",
-" cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+" cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 " assistant_message = {\n",
 " \"role\": \"assistant\", # was user\n",
-" \"content\": response.completion_message.content,\n",
-" \"stop_reason\": response.completion_message.stop_reason,\n",
+" \"content\": response.choices[0].message.content,\n",
+" \"stop_reason\": response.choices[0].finish_reason,\n",
 " }\n",
 " conversation_history.append(assistant_message)\n",
 "\n",
@@ -862,16 +862,16 @@
 " user_message = {\"role\": \"user\", \"content\": user_input}\n",
 " conversation_history.append(user_message)\n",
 "\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=conversation_history,\n",
-" model_id=model_id,\n",
+" model=model_id,\n",
 " )\n",
-" cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n",
+" cprint(f\"> Response: {response.choices[0].message.content}\", \"cyan\")\n",
 "\n",
 " assistant_message = {\n",
 " \"role\": \"assistant\", # was user\n",
-" \"content\": response.completion_message.content,\n",
-" \"stop_reason\": response.completion_message.stop_reason,\n",
+" \"content\": response.choices[0].message.content,\n",
+" \"stop_reason\": response.choices[0].finish_reason,\n",
 " }\n",
 " conversation_history.append(assistant_message)\n",
 "\n",
@@ -3615,7 +3615,7 @@
 "from rich.pretty import pprint\n",
 "\n",
 "response = client.models.register(\n",
-" model_id=\"meta-llama/Llama-3.2-3B-Instruct\",\n",
+" model=\"meta-llama/Llama-3.2-3B-Instruct\",\n",
 " provider_id=\"ollama\",\n",
 " provider_model_id=\"llama3.2:3b\",\n",
 " # base model id\n",
@@ -5762,7 +5762,7 @@
 "source": [
 "response = client.models.register(\n",
 " # the model id here needs to be the finetuned checkpoint identifier\n",
-" model_id=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
+" model=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
 " provider_id=\"ollama\",\n",
 " provider_model_id=\"llama_3_2_finetuned:latest\",\n",
 " # base model id\n",
@@ -5816,14 +5816,14 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
-" model_id=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
+"response = client.chat.completions.create(\n",
+" model=\"meta-llama/Llama-3.2-3B-Instruct-sft-0\",\n",
 " messages=[\n",
 " {\"role\": \"user\", \"content\": \"What is the primary purpose of a W-2 form in relation to income tax?\"}\n",
 " ],\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -1003,7 +1003,7 @@
 "source": [
 "# register 405B as LLM Judge model\n",
 "client.models.register(\n",
-" model_id=\"meta-llama/Llama-3.1-405B-Instruct\",\n",
+" model=\"meta-llama/Llama-3.1-405B-Instruct\",\n",
 " provider_model_id=\"meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo\",\n",
 " provider_id=\"together\",\n",
 ")\n",
@@ -419,21 +419,15 @@
 "outputs": [],
 "source": [
 "# Test inference\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[\n",
 " {\"role\": \"user\", \"content\": sample_prompt}\n",
 " ],\n",
-" model_id=BASE_MODEL,\n",
-" sampling_params={\n",
-" \"max_tokens\": 20,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"temperature\": 0.7,\n",
-" \"top_p\": 0.9\n",
-" }\n",
-" }\n",
+" model=BASE_MODEL,\n",
+" max_tokens=20,\n",
+" temperature=0.7,\n",
 ")\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {
@@ -945,20 +939,14 @@
 "outputs": [],
 "source": [
 "# Test inference\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=sample_messages,\n",
-" model_id=BASE_MODEL,\n",
-" sampling_params={\n",
-" \"max_tokens\": 20,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"temperature\": 0.7,\n",
-" \"top_p\": 0.9\n",
-" }\n",
-" }\n",
+" model=BASE_MODEL,\n",
+" max_tokens=20,\n",
+" temperature=0.7,\n",
 ")\n",
-"assert response.completion_message.content is not None\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"assert response.choices[0].message.content is not None\n",
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {
@@ -1438,15 +1426,13 @@
 "outputs": [],
 "source": [
 "# Check inference without guardrails\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[message],\n",
-" model_id=BASE_MODEL,\n",
-" sampling_params={\n",
-" \"max_tokens\": 150,\n",
-" }\n",
+" model=BASE_MODEL,\n",
+" max_tokens=150,\n",
 ")\n",
-"assert response.completion_message.content is not None\n",
-"print(f\"Inference response: {response.completion_message.content}\")"
+"assert response.choices[0].message.content is not None\n",
+"print(f\"Inference response: {response.choices[0].message.content}\")"
 ]
 },
 {
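The hunks above repeatedly fold the legacy `sampling_params` block into flat OpenAI-style keyword arguments. A hedged sketch of that mapping (it assumes `client` and `BASE_MODEL` are defined as in the notebook; passing `top_p` as a flat kwarg is an assumption, since the hunks above only carry over `max_tokens` and `temperature`):

```python
# Legacy style (removed above):
#   sampling_params={"max_tokens": 20, "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.9}}
#
# OpenAI-compatible style (added above):
response = client.chat.completions.create(
    model=BASE_MODEL,
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=20,
    temperature=0.7,
    top_p=0.9,  # assumption: accepted as a flat kwarg; the diff itself drops top_p
)
print(response.choices[0].message.content)
```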
@@ -687,23 +687,17 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"completion = client.inference.chat_completion(\n",
-" model_id=CUSTOMIZED_MODEL,\n",
+"completion = client.chat.completions.create(\n",
+" model=CUSTOMIZED_MODEL,\n",
 " messages=test_sample[\"messages\"],\n",
 " tools=test_sample[\"tools\"],\n",
 " tool_choice=\"auto\",\n",
 " stream=False,\n",
-" sampling_params={\n",
-" \"max_tokens\": 512,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"temperature\": 0.1,\n",
-" \"top_p\": 0.7,\n",
-" }\n",
-" },\n",
+" max_tokens=512,\n",
+" temperature=0.1,\n",
 ")\n",
 "\n",
-"completion.completion_message.tool_calls"
+"completion.choices[0].message.tool_calls"
 ]
 },
 {
@@ -423,42 +423,30 @@
 " violation = self.check_guardrails(user_message.get(\"content\"))\n",
 " \n",
 " if violation is None:\n",
-" completion = client.inference.chat_completion(\n",
-" model_id=self.customized_model,\n",
+" completion = client.chat.completions.create(\n",
+" model=self.customized_model,\n",
 " messages=[user_message],\n",
 " tools=tools,\n",
 " tool_choice=\"auto\",\n",
 " stream=False,\n",
-" sampling_params={\n",
-" \"max_tokens\": 1024,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"top_p\": 0.7,\n",
-" \"temperature\": 0.2\n",
-" }\n",
-" }\n",
+" max_tokens=1024,\n",
+" temperature=0.2,\n",
 " )\n",
-" return completion.completion_message\n",
+" return completion.choices[0].message.content\n",
 " else:\n",
 " return f\"Not a safe input, the guardrails has resulted in a violation: {violation}. Tool-calling shall not happen\"\n",
 " \n",
 " elif self.guardrails == \"OFF\":\n",
-" completion = client.inference.chat_completion(\n",
-" model_id=self.customized_model,\n",
+" completion = client.chat.completions.create(\n",
+" model=self.customized_model,\n",
 " messages=[user_message],\n",
 " tools=tools,\n",
 " tool_choice=\"auto\",\n",
 " stream=False,\n",
-" sampling_params={\n",
-" \"max_tokens\": 1024,\n",
-" \"strategy\": {\n",
-" \"type\": \"top_p\",\n",
-" \"top_p\": 0.7,\n",
-" \"temperature\": 0.2\n",
-" }\n",
-" }\n",
+" max_tokens=1024,\n",
+" temperature=0.2,\n",
 " )\n",
-" return completion.completion_message"
+" return completion.choices[0].message.content"
 ]
 },
 {
@@ -60,7 +60,7 @@ client = LlamaStackClient(
     base_url="http://localhost:8321"
 )

-response = client.inference.chat_completion(
+response = client.chat.completions.create(
     model="Llama3.2-3B-Instruct",
     messages=[{
         "role": "user",
docs/static/llama-stack-spec.html (vendored, 1804 changes): file diff suppressed because it is too large.
docs/static/llama-stack-spec.yaml (vendored, 1372 changes): file diff suppressed because it is too large.
@@ -102,15 +102,15 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[\n",
 " {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
 " {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
 " ],\n",
-" model_id=MODEL_NAME,\n",
+" model=MODEL_NAME,\n",
 ")\n",
 "\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -141,14 +141,14 @@
 }
 ],
 "source": [
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[\n",
 " {\"role\": \"system\", \"content\": \"You are shakespeare.\"},\n",
 " {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"}\n",
 " ],\n",
-" model_id=MODEL_NAME, # Changed from model to model_id\n",
+" model=MODEL_NAME,\n",
 ")\n",
-"print(response.completion_message.content)"
+"print(response.choices[0].message.content)"
 ]
 },
 {
@@ -218,11 +218,11 @@
 " break\n",
 "\n",
 " message = {\"role\": \"user\", \"content\": user_input}\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=[message],\n",
-" model_id=MODEL_NAME\n",
+" model=MODEL_NAME\n",
 " )\n",
-" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
 "\n",
 "# Run the chat loop in a Jupyter Notebook cell using await\n",
 "await chat_loop()\n",
@@ -288,16 +288,16 @@
 " user_message = {\"role\": \"user\", \"content\": user_input}\n",
 " conversation_history.append(user_message)\n",
 "\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=conversation_history,\n",
-" model_id=MODEL_NAME,\n",
+" model=MODEL_NAME,\n",
 " )\n",
-" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
 "\n",
 " # Append the assistant message with all required fields\n",
 " assistant_message = {\n",
 " \"role\": \"user\",\n",
-" \"content\": response.completion_message.content,\n",
+" \"content\": response.choices[0].message.content,\n",
 " # Add any additional required fields here if necessary\n",
 " }\n",
 " conversation_history.append(assistant_message)\n",
@@ -349,14 +349,14 @@
 " }\n",
 " cprint(f'User> {message[\"content\"]}', 'green')\n",
 "\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=[message],\n",
-" model_id=MODEL_NAME,\n",
+" model=MODEL_NAME,\n",
 " stream=stream,\n",
 " )\n",
 "\n",
 " if not stream:\n",
-" cprint(f'> Response: {response.completion_message.content}', 'cyan')\n",
+" cprint(f'> Response: {response.choices[0].message.content}', 'cyan')\n",
 " else:\n",
 " for log in EventLogger().log(response):\n",
 " log.print()\n",
@@ -134,15 +134,15 @@
 " }\n",
 " cprint(f'User> {message[\"content\"]}', 'green')\n",
 "\n",
-" response = await client.inference.chat_completion(\n",
+" response = await client.chat.completions.create(\n",
 " messages=[message],\n",
-" model_id='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
+" model='meta-llama/Llama3.2-11B-Vision-Instruct',\n",
 " stream=stream,\n",
 " )\n",
 "\n",
 " cprint(f'Assistant> ', color='cyan', end='')\n",
 " if not stream:\n",
-" cprint(response.completion_message.content, color='yellow')\n",
+" cprint(response.choices[0].message.content, color='yellow')\n",
 " else:\n",
 " async for chunk in response:\n",
 " cprint(chunk.event.delta.text, color='yellow', end='')\n",
@@ -152,8 +152,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"response = client.inference.chat_completion(\n",
-" messages=few_shot_examples, model_id=MODEL_NAME\n",
+"response = client.chat.completions.create(\n",
+" messages=few_shot_examples, model=MODEL_NAME\n",
 ")"
 ]
 },
@@ -164,7 +164,7 @@
 "source": [
 "#### 4. Display the Model’s Response\n",
 "\n",
-"The `completion_message` contains the assistant’s generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
+"The `choices[0].message.content` contains the assistant’s generated content based on the few-shot examples provided. Output this content to see the model's response directly in the console.\n"
 ]
 },
 {
@@ -184,7 +184,7 @@
 "source": [
 "from termcolor import cprint\n",
 "\n",
-"cprint(f'> Response: {response.completion_message.content}', 'cyan')"
+"cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
 ]
 },
 {
@@ -219,7 +219,7 @@
 "\n",
 "client = LlamaStackClient(base_url=f'http://{HOST}:{PORT}')\n",
 "\n",
-"response = client.inference.chat_completion(\n",
+"response = client.chat.completions.create(\n",
 " messages=[\n",
 " {\"role\": \"user\", \"content\": 'Have shorter, spear-shaped ears.'},\n",
 " {\n",
@@ -253,10 +253,10 @@
 " \"content\": 'Generally taller and more robust, commonly seen as guard animals.'\n",
 " }\n",
 "],\n",
-" model_id=MODEL_NAME,\n",
+" model=MODEL_NAME,\n",
 ")\n",
 "\n",
-"cprint(f'> Response: {response.completion_message.content}', 'cyan')"
+"cprint(f'> Response: {response.choices[0].message.content}', 'cyan')"
 ]
 },
 {
@@ -102,15 +102,15 @@
 " }\n",
 "\n",
 " cprint(\"User> Sending image for analysis...\", \"green\")\n",
-" response = client.inference.chat_completion(\n",
+" response = client.chat.completions.create(\n",
 " messages=[message],\n",
-" model_id=MODEL_NAME,\n",
+" model=MODEL_NAME,\n",
 " stream=stream,\n",
 " )\n",
 "\n",
 " cprint(f'Assistant> ', color='cyan', end='')\n",
 " if not stream:\n",
-" cprint(response.completion_message.content, color='yellow')\n",
+" cprint(response.choices[0].message.content, color='yellow')\n",
 " else:\n",
 " for chunk in response:\n",
 " cprint(chunk.event.delta.text, color='yellow', end='')\n",
@@ -131,14 +131,37 @@ After setting up the server, open a new terminal window and configure the llama-
 ```
 **Expected Output:**
 ```bash
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content='Here is a 2-sentence poem about the moon:\n\nSilver crescent shining bright in the night,\nA beacon of wonder, full of gentle light.',
-        role='assistant',
-        stop_reason='end_of_turn',
-        tool_calls=[]
-    ),
-    logprobs=None
-)
+OpenAIChatCompletion(
+    id='chatcmpl-950',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content='...The moon casts silver threads through the velvet night, a silent bard of shadows, ancient and bright.',
+                name=None,
+                tool_calls=None,
+                refusal=None,
+                annotations=None,
+                audio=None,
+                function_call=None
+            ),
+            logprobs=None
+        )
+    ],
+    created=1759240813,
+    model='meta-llama/Llama-3.2-3B-Instruct',
+    object='chat.completion',
+    service_tier=None,
+    system_fingerprint='fp_ollama',
+    usage={
+        'completion_tokens': 479,
+        'prompt_tokens': 19,
+        'total_tokens': 498,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    },
+)
 ```

@@ -147,21 +170,16 @@ After setting up the server, open a new terminal window and configure the llama-
 After setting up the server, open a new terminal window and verify it's working by sending a `POST` request using `curl`:

 ```bash
-curl http://localhost:$LLAMA_STACK_PORT/alpha/inference/chat-completion
+curl http://localhost:$LLAMA_STACK_PORT/v1/chat/completions
 -H "Content-Type: application/json"
 -d @- <<EOF
 {
-  "model_id": "$INFERENCE_MODEL",
+  "model": "$INFERENCE_MODEL",
   "messages": [
     {"role": "system", "content": "You are a helpful assistant."},
     {"role": "user", "content": "Write me a 2-sentence poem about the moon"}
   ],
-  "sampling_params": {
-    "strategy": {
-      "type": "top_p",
-      "temperatrue": 0.7,
-      "top_p": 0.95,
-    },
+  "temperature": 0.7,
+  "seed": 42,
   "max_tokens": 512
-  }
 }
@@ -174,13 +192,9 @@ You can check the available models with the command `uv run --with llama-stack-c
 **Expected Output:**
 ```json
 {
-  "completion_message": {
-    "role": "assistant",
-    "content": "The moon glows softly in the midnight sky,\nA beacon of wonder, as it catches the eye.",
-    "stop_reason": "out_of_tokens",
-    "tool_calls": []
-  },
-  "logprobs": null
+  ...
+  "content": "... The moon glows softly in the midnight sky,\nA beacon of wonder, as it catches the eye.",
+  ...
 }
 ```

@@ -213,17 +227,17 @@ if INFERENCE_MODEL is None:
 # Initialize the clien
 client = LlamaStackClient(base_url="http://localhost:8321")

-# Create a chat completion reques
-response = client.inference.chat_completion(
+# Create a chat completion request
+response = client.chat.completions.create(
     messages=[
         {"role": "system", "content": "You are a friendly assistant."},
         {"role": "user", "content": "Write a two-sentence poem about llama."},
     ],
-    model_id=INFERENCE_MODEL,
+    model=INFERENCE_MODEL,
 )

 # Print the response
-print(response.completion_message.content)
+print(response.choices[0].message.content)
 ```

 ### 3. Run the Python Script
@@ -1030,7 +1030,6 @@ class InferenceProvider(Protocol):
         """
         ...

-    @webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
     async def chat_completion(
         self,
         model_id: str,
@@ -44,8 +44,8 @@ client.initialize()
 The following example shows how to create a chat completion for an NVIDIA NIM.

 ```python
-response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "system",
@@ -57,11 +57,9 @@ response = client.inference.chat_completion(
         },
     ],
     stream=False,
-    sampling_params={
-        "max_tokens": 50,
-    },
+    max_tokens=50,
 )
-print(f"Response: {response.completion_message.content}")
+print(f"Response: {response.choices[0].message.content}")
 ```

 ### Tool Calling Example ###
@@ -89,15 +87,15 @@ tool_definition = ToolDefinition(
     },
 )

-tool_response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+tool_response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
     tools=[tool_definition],
 )

-print(f"Tool Response: {tool_response.completion_message.content}")
-if tool_response.completion_message.tool_calls:
-    for tool_call in tool_response.completion_message.tool_calls:
+print(f"Tool Response: {tool_response.choices[0].message.content}")
+if tool_response.choices[0].message.tool_calls:
+    for tool_call in tool_response.choices[0].message.tool_calls:
         print(f"Tool Called: {tool_call.tool_name}")
         print(f"Arguments: {tool_call.arguments}")
 ```
@@ -123,8 +121,8 @@ response_format = JsonSchemaResponseFormat(
     type=ResponseFormatType.json_schema, json_schema=person_schema
 )

-structured_response = client.inference.chat_completion(
-    model_id="meta-llama/Llama-3.1-8B-Instruct",
+structured_response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "user",
@@ -134,7 +132,7 @@ structured_response = client.inference.chat_completion(
     response_format=response_format,
 )

-print(f"Structured Response: {structured_response.completion_message.content}")
+print(f"Structured Response: {structured_response.choices[0].message.content}")
 ```

 ### Create Embeddings
@@ -167,8 +165,8 @@ def load_image_as_base64(image_path):
 image_path = {path_to_the_image}
 demo_image_b64 = load_image_as_base64(image_path)

-vlm_response = client.inference.chat_completion(
-    model_id="nvidia/vila",
+vlm_response = client.chat.completions.create(
+    model="nvidia/vila",
     messages=[
         {
             "role": "user",
@@ -188,5 +186,5 @@ vlm_response = client.inference.chat_completion(
     ],
 )

-print(f"VLM Response: {vlm_response.completion_message.content}")
+print(f"VLM Response: {vlm_response.choices[0].message.content}")
 ```
@@ -179,15 +179,15 @@ Note that when re-recording tests, you must use a Stack pointing to a server (i.
 ### Basic Test Pattern
 ```python
 def test_basic_chat_completion(llama_stack_client, text_model_id):
-    response = llama_stack_client.inference.chat_completion(
-        model_id=text_model_id,
+    response = llama_stack_client.chat.completions.create(
+        model=text_model_id,
         messages=[{"role": "user", "content": "Hello"}],
     )

     # Test structure, not AI output quality
-    assert response.completion_message is not None
-    assert isinstance(response.completion_message.content, str)
-    assert len(response.completion_message.content) > 0
+    assert response.choices[0].message is not None
+    assert isinstance(response.choices[0].message.content, str)
+    assert len(response.choices[0].message.content) > 0
 ```

 ### Provider-Specific Tests