Merge remote-tracking branch 'origin/main' into openai_v1
@@ -261,7 +261,7 @@ You can even run `llama model prompt-format` see all of the templates and their
 ```
 llama model prompt-format -m Llama3.2-3B-Instruct
 ```
 
 
 You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios.
@@ -217,7 +217,6 @@ from llama_stack_client.types import (
 Methods:
 
 - <code title="post /v1/inference/chat-completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
 - <code title="post /v1/inference/completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code>
-- <code title="post /v1/inference/embeddings">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>
 
 ## VectorIo
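The removed method above, together with the `/v1/inference/embeddings` path and the `EmbeddingsRequest`/`EmbeddingsResponse` schemas dropped from the spec further down, retires the legacy embeddings surface. A minimal migration sketch, assuming the replacement is the OpenAI-compatible embeddings endpoint (`model`/`input` instead of `model_id`/`contents`); the model id and base URL are placeholders, not taken from this diff:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

# Legacy call (removed in this diff):
# response = client.inference.embeddings(
#     model_id="all-MiniLM-L6-v2",      # placeholder model id
#     contents=["Roses are red"],
# )
# vectors = response.embeddings         # list[list[float]]

# Assumed OpenAI-style replacement:
response = client.embeddings.create(
    model="all-MiniLM-L6-v2",           # placeholder embedding model id
    input=["Roses are red"],            # plain strings; multimodal input not assumed
)
vectors = [item.embedding for item in response.data]
```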
@@ -824,16 +824,10 @@
     "\n",
     "\n",
     "user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
-    "response = client.inference.completion(\n",
-    "    model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
-    "    content=user_input,\n",
-    "    stream=False,\n",
-    "    sampling_params={\n",
-    "        \"strategy\": {\n",
-    "            \"type\": \"greedy\",\n",
-    "        },\n",
-    "        \"max_tokens\": 50,\n",
-    "    },\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
+    "    messages=[{\"role\": \"user\", \"content\": user_input}],\n",
+    "    max_tokens=50,\n",
     "    response_format={\n",
     "        \"type\": \"json_schema\",\n",
     "        \"json_schema\": Output.model_json_schema(),\n",
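For readability, here is the migrated structured-output cell from the hunk above, reassembled as plain Python. `Output` is the Pydantic model the notebook defines earlier; the closing `}` and `)` fall outside the hunk and are assumed:

```python
user_input = (
    "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. "
    "He retired in 2003. Extract this information into JSON for me. "
)
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": user_input}],
    max_tokens=50,
    response_format={
        "type": "json_schema",
        "json_schema": Output.model_json_schema(),  # Pydantic model -> JSON schema
    },
)  # closing lines assumed; they sit outside the hunk
```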
@@ -1013,7 +1007,7 @@
     "\n",
     "\n",
     "\n",
-    "<img src=\"https://github.com/meta-llama/llama-stack/blob/main/docs/resources/agentic-system.png?raw=true\" alt=\"drawing\" width=\"800\"/>\n",
+    "<img src=\"https://github.com/meta-llama/llama-stack/blob/main/docs/static/img/agentic-system.png?raw=true\" alt=\"drawing\" width=\"800\"/>\n",
     "\n",
     "\n",
     "Agents are characterized by having access to\n",
@@ -706,20 +706,15 @@
     "    provider_id=\"nvidia\",\n",
     ")\n",
     "\n",
-    "response = client.inference.completion(\n",
-    "    content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+    "response = client.completions.create(\n",
+    "    prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
     "    stream=False,\n",
-    "    model_id=CUSTOMIZED_MODEL_DIR,\n",
-    "    sampling_params={\n",
-    "        \"strategy\": {\n",
-    "            \"type\": \"top_p\",\n",
-    "            \"temperature\": 0.7,\n",
-    "            \"top_p\": 0.9\n",
-    "        },\n",
-    "        \"max_tokens\": 20,\n",
-    "    },\n",
+    "    model=CUSTOMIZED_MODEL_DIR,\n",
+    "    temperature=0.7,\n",
+    "    top_p=0.9,\n",
+    "    max_tokens=20,\n",
     ")\n",
-    "print(f\"Inference response: {response.content}\")"
+    "print(f\"Inference response: {response.choices[0].text}\")"
    ]
   },
   {
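Reassembled, the migrated NVIDIA inference cell from the hunk above: the nested `sampling_params` block (strategy type, `temperature`, `top_p`, `max_tokens`) flattens into OpenAI-style keyword arguments, and the completion text moves from `response.content` to `response.choices[0].text`. The next hunk applies the identical rewrite with `customized_chat_model_dir`:

```python
response = client.completions.create(
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    stream=False,
    model=CUSTOMIZED_MODEL_DIR,  # defined earlier in the notebook
    temperature=0.7,
    top_p=0.9,
    max_tokens=20,
)
print(f"Inference response: {response.choices[0].text}")
```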
@@ -1233,20 +1228,15 @@
     "    provider_id=\"nvidia\",\n",
     ")\n",
     "\n",
-    "response = client.inference.completion(\n",
-    "    content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
+    "response = client.completions.create(\n",
+    "    prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
     "    stream=False,\n",
-    "    model_id=customized_chat_model_dir,\n",
-    "    sampling_params={\n",
-    "        \"strategy\": {\n",
-    "            \"type\": \"top_p\",\n",
-    "            \"temperature\": 0.7,\n",
-    "            \"top_p\": 0.9\n",
-    "        },\n",
-    "        \"max_tokens\": 20,\n",
-    "    },\n",
+    "    model=customized_chat_model_dir,\n",
+    "    temperature=0.7,\n",
+    "    top_p=0.9,\n",
+    "    max_tokens=20,\n",
     ")\n",
-    "print(f\"Inference response: {response.content}\")"
+    "print(f\"Inference response: {response.choices[0].text}\")"
    ]
   },
   {
[5 binary image files changed; dimensions and sizes unchanged (128 KiB, 220 KiB, 71 KiB, 17 KiB, 170 KiB)]
docs/static/llama-stack-spec.html (vendored, 120 changes)
@@ -1239,50 +1239,6 @@
         ]
       }
     },
-    "/v1/inference/embeddings": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/EmbeddingsResponse"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "Inference"
-        ],
-        "summary": "Generate embeddings for content pieces using the specified model.",
-        "description": "Generate embeddings for content pieces using the specified model.",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/EmbeddingsRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1alpha/eval/benchmarks/{benchmark_id}/evaluations": {
       "post": {
         "responses": {
@@ -6965,7 +6921,7 @@
         }
       }
     },
-    "/v1/inference/rerank": {
+    "/v1alpha/inference/rerank": {
       "post": {
         "responses": {
           "200": {
@@ -12081,80 +12037,6 @@
       "title": "OpenAIDeleteResponseObject",
       "description": "Response object confirming deletion of an OpenAI response."
     },
-    "EmbeddingsRequest": {
-      "type": "object",
-      "properties": {
-        "model_id": {
-          "type": "string",
-          "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
-        },
-        "contents": {
-          "oneOf": [
-            {
-              "type": "array",
-              "items": {
-                "type": "string"
-              }
-            },
-            {
-              "type": "array",
-              "items": {
-                "$ref": "#/components/schemas/InterleavedContentItem"
-              }
-            }
-          ],
-          "description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text."
-        },
-        "text_truncation": {
-          "type": "string",
-          "enum": [
-            "none",
-            "start",
-            "end"
-          ],
-          "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length."
-        },
-        "output_dimension": {
-          "type": "integer",
-          "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models."
-        },
-        "task_type": {
-          "type": "string",
-          "enum": [
-            "query",
-            "document"
-          ],
-          "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models."
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "model_id",
-        "contents"
-      ],
-      "title": "EmbeddingsRequest"
-    },
-    "EmbeddingsResponse": {
-      "type": "object",
-      "properties": {
-        "embeddings": {
-          "type": "array",
-          "items": {
-            "type": "array",
-            "items": {
-              "type": "number"
-            }
-          },
-          "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "embeddings"
-      ],
-      "title": "EmbeddingsResponse",
-      "description": "Response containing generated embeddings."
-    },
     "AgentCandidate": {
       "type": "object",
       "properties": {
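The removed `EmbeddingsResponse` packed results as a bare list of vectors; an OpenAI-style response (the assumed replacement, not shown in this diff) wraps each vector in a `data` item. A shape-comparison sketch; only the old shape is confirmed by the schema removed above:

```python
# Old (removed) EmbeddingsResponse shape: bare list of float vectors.
old_response = {
    "embeddings": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
}

# Assumed OpenAI-style shape: one object per input, vector under "embedding".
new_response = {
    "object": "list",
    "data": [
        {"object": "embedding", "index": 0, "embedding": [0.1, 0.2, 0.3]},
        {"object": "embedding", "index": 1, "embedding": [0.4, 0.5, 0.6]},
    ],
    "model": "example-embedding-model",  # placeholder
}

# The same vector, addressed in each shape:
assert old_response["embeddings"][0] == new_response["data"][0]["embedding"]
```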
docs/static/llama-stack-spec.yaml (vendored, 103 changes)
@@ -861,41 +861,6 @@ paths:
       required: true
       schema:
         type: string
-  /v1/inference/embeddings:
-    post:
-      responses:
-        '200':
-          description: >-
-            An array of embeddings, one for each content. Each embedding is a list
-            of floats. The dimensionality of the embedding is model-specific; you
-            can check model metadata using /models/{model_id}.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EmbeddingsResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate embeddings for content pieces using the specified model.
-      description: >-
-        Generate embeddings for content pieces using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EmbeddingsRequest'
-        required: true
   /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
     post:
       responses:
@@ -5040,7 +5005,7 @@ paths:
         schema:
           $ref: '#/components/schemas/QueryTracesRequest'
       required: true
-  /v1/inference/rerank:
+  /v1alpha/inference/rerank:
     post:
       responses:
         '200':
@@ -8937,72 +8902,6 @@ components:
       title: OpenAIDeleteResponseObject
       description: >-
        Response object confirming deletion of an OpenAI response.
-    EmbeddingsRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be an embedding model
-            registered with Llama Stack and available via the /models endpoint.
-        contents:
-          oneOf:
-            - type: array
-              items:
-                type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/InterleavedContentItem'
-          description: >-
-            List of contents to generate embeddings for. Each content can be a string
-            or an InterleavedContentItem (and hence can be multimodal). The behavior
-            depends on the model and provider. Some models may only support text.
-        text_truncation:
-          type: string
-          enum:
-            - none
-            - start
-            - end
-          description: >-
-            (Optional) Config for how to truncate text for embedding when text is
-            longer than the model's max sequence length.
-        output_dimension:
-          type: integer
-          description: >-
-            (Optional) Output dimensionality for the embeddings. Only supported by
-            Matryoshka models.
-        task_type:
-          type: string
-          enum:
-            - query
-            - document
-          description: >-
-            (Optional) How is the embedding being used? This is only supported by
-            asymmetric embedding models.
-      additionalProperties: false
-      required:
-        - model_id
-        - contents
-      title: EmbeddingsRequest
-    EmbeddingsResponse:
-      type: object
-      properties:
-        embeddings:
-          type: array
-          items:
-            type: array
-            items:
-              type: number
-          description: >-
-            List of embedding vectors, one per input content. Each embedding is a
-            list of floats. The dimensionality of the embedding is model-specific;
-            you can check model metadata using /models/{model_id}
-      additionalProperties: false
-      required:
-        - embeddings
-      title: EmbeddingsResponse
-      description: >-
-        Response containing generated embeddings.
     AgentCandidate:
       type: object
       properties: