Merge remote-tracking branch 'origin/main' into openai_v1

This commit is contained in:
Ashwin Bharambe 2025-09-29 13:41:11 -07:00
commit 35546386a2
52 changed files with 580 additions and 802 deletions

View file

@ -261,7 +261,7 @@ You can even run `llama model prompt-format` see all of the templates and their
```
llama model prompt-format -m Llama3.2-3B-Instruct
```
![alt text](../../../resources/prompt-format.png)
![alt text](/img/prompt-format.png)
You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios.

View file

@ -217,7 +217,6 @@ from llama_stack_client.types import (
Methods:
- <code title="post /v1/inference/chat-completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
- <code title="post /v1/inference/completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code>
- <code title="post /v1/inference/embeddings">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>
## VectorIo

View file

@ -824,16 +824,10 @@
"\n",
"\n",
"user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
"response = client.inference.completion(\n",
" model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" content=user_input,\n",
" stream=False,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"greedy\",\n",
" },\n",
" \"max_tokens\": 50,\n",
" },\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" messages=[{\"role\": \"user\", \"content\": user_input}],\n",
" max_tokens=50,\n",
" response_format={\n",
" \"type\": \"json_schema\",\n",
" \"json_schema\": Output.model_json_schema(),\n",
@ -1013,7 +1007,7 @@
"\n",
"\n",
"\n",
"<img src=\"https://github.com/meta-llama/llama-stack/blob/main/docs/resources/agentic-system.png?raw=true\" alt=\"drawing\" width=\"800\"/>\n",
"<img src=\"https://github.com/meta-llama/llama-stack/blob/main/docs/static/img/agentic-system.png?raw=true\" alt=\"drawing\" width=\"800\"/>\n",
"\n",
"\n",
"Agents are characterized by having access to\n",

View file

@ -706,20 +706,15 @@
" provider_id=\"nvidia\",\n",
")\n",
"\n",
"response = client.inference.completion(\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
"response = client.completions.create(\n",
" prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
" model_id=CUSTOMIZED_MODEL_DIR,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" },\n",
" \"max_tokens\": 20,\n",
" },\n",
" model=CUSTOMIZED_MODEL_DIR,\n",
" temperature=0.7,\n",
" top_p=0.9,\n",
" max_tokens=20,\n",
")\n",
"print(f\"Inference response: {response.content}\")"
"print(f\"Inference response: {response.choices[0].text}\")"
]
},
{
@ -1233,20 +1228,15 @@
" provider_id=\"nvidia\",\n",
")\n",
"\n",
"response = client.inference.completion(\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
"response = client.completions.create(\n",
" prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
" model_id=customized_chat_model_dir,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" },\n",
" \"max_tokens\": 20,\n",
" },\n",
" model=customized_chat_model_dir,\n",
" temperature=0.7,\n",
" top_p=0.9,\n",
" max_tokens=20,\n",
")\n",
"print(f\"Inference response: {response.content}\")"
"print(f\"Inference response: {response.choices[0].text}\")"
]
},
{

View file

Before

Width:  |  Height:  |  Size: 128 KiB

After

Width:  |  Height:  |  Size: 128 KiB

Before After
Before After

View file

Before

Width:  |  Height:  |  Size: 220 KiB

After

Width:  |  Height:  |  Size: 220 KiB

Before After
Before After

View file

Before

Width:  |  Height:  |  Size: 71 KiB

After

Width:  |  Height:  |  Size: 71 KiB

Before After
Before After

View file

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 17 KiB

Before After
Before After

View file

Before

Width:  |  Height:  |  Size: 170 KiB

After

Width:  |  Height:  |  Size: 170 KiB

Before After
Before After

View file

@ -1239,50 +1239,6 @@
]
}
},
"/v1/inference/embeddings": {
"post": {
"responses": {
"200": {
"description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EmbeddingsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"summary": "Generate embeddings for content pieces using the specified model.",
"description": "Generate embeddings for content pieces using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EmbeddingsRequest"
}
}
},
"required": true
}
}
},
"/v1alpha/eval/benchmarks/{benchmark_id}/evaluations": {
"post": {
"responses": {
@ -6965,7 +6921,7 @@
}
}
},
"/v1/inference/rerank": {
"/v1alpha/inference/rerank": {
"post": {
"responses": {
"200": {
@ -12081,80 +12037,6 @@
"title": "OpenAIDeleteResponseObject",
"description": "Response object confirming deletion of an OpenAI response."
},
"EmbeddingsRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
},
"contents": {
"oneOf": [
{
"type": "array",
"items": {
"type": "string"
}
},
{
"type": "array",
"items": {
"$ref": "#/components/schemas/InterleavedContentItem"
}
}
],
"description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text."
},
"text_truncation": {
"type": "string",
"enum": [
"none",
"start",
"end"
],
"description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length."
},
"output_dimension": {
"type": "integer",
"description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models."
},
"task_type": {
"type": "string",
"enum": [
"query",
"document"
],
"description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models."
}
},
"additionalProperties": false,
"required": [
"model_id",
"contents"
],
"title": "EmbeddingsRequest"
},
"EmbeddingsResponse": {
"type": "object",
"properties": {
"embeddings": {
"type": "array",
"items": {
"type": "array",
"items": {
"type": "number"
}
},
"description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
}
},
"additionalProperties": false,
"required": [
"embeddings"
],
"title": "EmbeddingsResponse",
"description": "Response containing generated embeddings."
},
"AgentCandidate": {
"type": "object",
"properties": {

View file

@ -861,41 +861,6 @@ paths:
required: true
schema:
type: string
/v1/inference/embeddings:
post:
responses:
'200':
description: >-
An array of embeddings, one for each content. Each embedding is a list
of floats. The dimensionality of the embedding is model-specific; you
can check model metadata using /models/{model_id}.
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate embeddings for content pieces using the specified model.
description: >-
Generate embeddings for content pieces using the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
/v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
post:
responses:
@ -5040,7 +5005,7 @@ paths:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
/v1/inference/rerank:
/v1alpha/inference/rerank:
post:
responses:
'200':
@ -8937,72 +8902,6 @@ components:
title: OpenAIDeleteResponseObject
description: >-
Response object confirming deletion of an OpenAI response.
EmbeddingsRequest:
type: object
properties:
model_id:
type: string
description: >-
The identifier of the model to use. The model must be an embedding model
registered with Llama Stack and available via the /models endpoint.
contents:
oneOf:
- type: array
items:
type: string
- type: array
items:
$ref: '#/components/schemas/InterleavedContentItem'
description: >-
List of contents to generate embeddings for. Each content can be a string
or an InterleavedContentItem (and hence can be multimodal). The behavior
depends on the model and provider. Some models may only support text.
text_truncation:
type: string
enum:
- none
- start
- end
description: >-
(Optional) Config for how to truncate text for embedding when text is
longer than the model's max sequence length.
output_dimension:
type: integer
description: >-
(Optional) Output dimensionality for the embeddings. Only supported by
Matryoshka models.
task_type:
type: string
enum:
- query
- document
description: >-
(Optional) How is the embedding being used? This is only supported by
asymmetric embedding models.
additionalProperties: false
required:
- model_id
- contents
title: EmbeddingsRequest
EmbeddingsResponse:
type: object
properties:
embeddings:
type: array
items:
type: array
items:
type: number
description: >-
List of embedding vectors, one per input content. Each embedding is a
list of floats. The dimensionality of the embedding is model-specific;
you can check model metadata using /models/{model_id}
additionalProperties: false
required:
- embeddings
title: EmbeddingsResponse
description: >-
Response containing generated embeddings.
AgentCandidate:
type: object
properties: